From b2cb4180ac6234820117d5a881e48c700aee9d19 Mon Sep 17 00:00:00 2001 From: Raymond Douglass Date: Thu, 25 Mar 2021 14:22:44 -0400 Subject: [PATCH 01/59] DOC v0.20 Updates --- CHANGELOG.md | 4 ++++ conda/environments/cudf_dev_cuda10.1.yml | 2 +- conda/environments/cudf_dev_cuda10.2.yml | 2 +- conda/environments/cudf_dev_cuda11.0.yml | 2 +- cpp/doxygen/Doxyfile | 4 ++-- cpp/libcudf_kafka/CMakeLists.txt | 2 +- docs/cudf/source/conf.py | 4 ++-- 7 files changed, 12 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 21ab8ed3274..df002654aa7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# cuDF 0.20.0 (Date TBD) + +Please see https://github.com/rapidsai/cudf/releases/tag/v0.20.0a for the latest changes to this development branch. + # cuDF 0.19.0 (Date TBD) Please see https://github.com/rapidsai/cudf/releases/tag/v0.19.0a for the latest changes to this development branch. diff --git a/conda/environments/cudf_dev_cuda10.1.yml b/conda/environments/cudf_dev_cuda10.1.yml index 35108ddd8ca..8c7126f81d1 100644 --- a/conda/environments/cudf_dev_cuda10.1.yml +++ b/conda/environments/cudf_dev_cuda10.1.yml @@ -11,7 +11,7 @@ dependencies: - clang=8.0.1 - clang-tools=8.0.1 - cupy>7.1.0,<9.0.0a0 - - rmm=0.19.* + - rmm=0.20.* - cmake>=3.14 - cmake_setuptools>=0.1.3 - python>=3.6,<3.8 diff --git a/conda/environments/cudf_dev_cuda10.2.yml b/conda/environments/cudf_dev_cuda10.2.yml index 3a24e38a397..3e81904633e 100644 --- a/conda/environments/cudf_dev_cuda10.2.yml +++ b/conda/environments/cudf_dev_cuda10.2.yml @@ -11,7 +11,7 @@ dependencies: - clang=8.0.1 - clang-tools=8.0.1 - cupy>7.1.0,<9.0.0a0 - - rmm=0.19.* + - rmm=0.20.* - cmake>=3.14 - cmake_setuptools>=0.1.3 - python>=3.6,<3.8 diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml index 821c6f5320d..72b9d134b62 100644 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ b/conda/environments/cudf_dev_cuda11.0.yml @@ -11,7 +11,7 @@ dependencies: - clang=8.0.1 - clang-tools=8.0.1 - cupy>7.1.0,<9.0.0a0 - - rmm=0.19.* + - rmm=0.20.* - cmake>=3.14 - cmake_setuptools>=0.1.3 - python>=3.6,<3.8 diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index 8fde8098bd3..eaa632860e5 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 0.19.0 +PROJECT_NUMBER = 0.20.0 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -2167,7 +2167,7 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. -TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/0.19 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/0.20 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. 
See section "Linking to diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index e178f5a6280..2f7fa5fc0fe 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -15,7 +15,7 @@ #============================================================================= cmake_minimum_required(VERSION 3.18 FATAL_ERROR) -project(CUDA_KAFKA VERSION 0.19.0 LANGUAGES CXX) +project(CUDA_KAFKA VERSION 0.20.0 LANGUAGES CXX) ################################################################################################### # - Build options diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index b68d7b5849f..18ffbacca1f 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -77,9 +77,9 @@ # built documents. # # The short X.Y version. -version = "0.19" +version = '0.20' # The full version, including alpha/beta/rc tags. -release = "0.19.0" +release = '0.20.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. From f285302877ca5ae94a0c8bf0a2c9ee34a1e4cd8b Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 31 Mar 2021 05:44:21 -0700 Subject: [PATCH 02/59] Adds `list.unique` API (#7664) Closes #7414 This PR adds `list.unique` API. Following `Series.unique` behavior, this API treats null values as equal, and treats all nans as equal. This API does not guarantee the order of list elements. Example: ```python >>> s = cudf.Series([[1, 1, 2, None, None], None, [np.nan, np.nan], []]) >>> s.list.unique() # Order of list elements is not gaurenteed 0 [1.0, 2.0, nan] 1 None 2 [nan] 3 [] dtype: list ``` Authors: - Michael Wang (@isVoid) Approvers: - Keith Kraus (@kkraus14) - Nghia Truong (@ttnghia) URL: https://github.com/rapidsai/cudf/pull/7664 --- .../_lib/cpp/lists/drop_list_duplicates.pxd | 15 +++++++ python/cudf/cudf/_lib/cpp/types.pxd | 4 ++ python/cudf/cudf/_lib/lists.pyx | 40 ++++++++++++++++++- python/cudf/cudf/core/column/lists.py | 36 +++++++++++++++++ python/cudf/cudf/tests/test_list.py | 34 ++++++++++++++++ 5 files changed, 127 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd diff --git a/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd b/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd new file mode 100644 index 00000000000..40b1836f932 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.types cimport null_equality, nan_equality + +cdef extern from "cudf/lists/drop_list_duplicates.hpp" \ + namespace "cudf::lists" nogil: + cdef unique_ptr[column] drop_list_duplicates( + const lists_column_view lists_column, + null_equality nulls_equal, + nan_equality nans_equal + ) except + diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index bd1108b2cdf..1f2094b3958 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -46,6 +46,10 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: EQUAL "cudf::null_equality::EQUAL" UNEQUAL "cudf::null_equality::UNEQUAL" + ctypedef enum nan_equality "cudf::nan_equality": + ALL_EQUAL "cudf::nan_equality::ALL_EQUAL" + UNEQUAL "cudf::nan_equality::UNEQUAL" + ctypedef enum type_id "cudf::type_id": EMPTY "cudf::type_id::EMPTY" INT8 "cudf::type_id::INT8" diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 7f745e58c67..e93cba20f65 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -10,6 +10,9 @@ from cudf._lib.cpp.lists.count_elements cimport ( from cudf._lib.cpp.lists.explode cimport ( explode_outer as cpp_explode_outer ) +from cudf._lib.cpp.lists.drop_list_duplicates cimport ( + drop_list_duplicates as cpp_drop_list_duplicates +) from cudf._lib.cpp.lists.sorting cimport ( sort_lists as cpp_sort_lists ) @@ -22,7 +25,13 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.cpp.types cimport size_type, order, null_order +from cudf._lib.cpp.types cimport ( + size_type, + null_equality, + order, + null_order, + nan_equality +) from cudf._lib.column cimport Column from cudf._lib.table cimport Table @@ -71,6 +80,34 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False): ) +def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal): + """ + nans_all_equal == True indicates that libcudf should treat any two elements + from {+nan, -nan} as equal, and as unequal otherwise. + nulls_equal == True indicates that libcudf should treat any two nulls as + equal, and as unequal otherwise. 
+ """ + cdef shared_ptr[lists_column_view] list_view = ( + make_shared[lists_column_view](col.view()) + ) + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL + ) + + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_drop_list_duplicates(list_view.get()[0], + c_nulls_equal, + c_nans_equal) + ) + return Column.from_unique_ptr(move(c_result)) + + def sort_lists(Column col, bool ascending, str na_position): cdef shared_ptr[lists_column_view] list_view = ( make_shared[lists_column_view](col.view()) @@ -121,6 +158,5 @@ def contains_scalar(Column col, DeviceScalar search_key): list_view.get()[0], search_key_value[0], )) - result = Column.from_unique_ptr(move(c_result)) return result diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index b7f34e8c007..364675cd035 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -10,6 +10,7 @@ from cudf._lib.lists import ( contains_scalar, count_elements, + drop_list_duplicates, extract_element, sort_lists, ) @@ -361,6 +362,41 @@ def take(self, lists_indices): else: return res + def unique(self): + """ + Returns unique element for each list in the column, order for each + unique element is not guaranteed. + + Returns + ------- + ListColumn + + Examples + -------- + >>> s = cudf.Series([[1, 1, 2, None, None], None, [4, 4], []]) + >>> s + 0 [1.0, 1.0, 2.0, nan, nan] + 1 None + 2 [4.0, 4.0] + 3 [] + dtype: list + >>> s.list.unique() # Order of list element is not guaranteed + 0 [1.0, 2.0, nan] + 1 None + 2 [4.0] + 3 [] + dtype: list + """ + + if is_list_dtype(self._column.children[1].dtype): + raise NotImplementedError("Nested lists unique is not supported.") + + return self._return_or_inplace( + drop_list_duplicates( + self._column, nulls_equal=True, nans_all_equal=True + ) + ) + def sort_values( self, ascending=True, diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 5645ce60596..9906600304b 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,6 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. import functools +import numpy as np import pandas as pd import pyarrow as pa import pytest @@ -162,6 +163,39 @@ def test_take_invalid(invalid, exception): gs.list.take(invalid) +@pytest.mark.parametrize( + ("data", "expected"), + [ + ([[1, 1, 2, 2], [], None, [3, 4, 5]], [[1, 2], [], None, [3, 4, 5]]), + ( + [[1.233, np.nan, 1.234, 3.141, np.nan, 1.234]], + [[1.233, 1.234, np.nan, 3.141]], + ), # duplicate nans + ([[1, 1, 2, 2, None, None]], [[1, 2, None]]), # duplicate nulls + ( + [[1.233, np.nan, None, 1.234, 3.141, np.nan, 1.234, None]], + [[1.233, 1.234, np.nan, None, 3.141]], + ), # duplicate nans and nulls + ([[2, None, 1, None, 2]], [[1, 2, None]]), + ([[], []], [[], []]), + ([[], None], [[], None]), + ], +) +def test_unique(data, expected): + """ + Pandas de-duplicates nans and nulls respectively in Series.unique. 
`expected` is set up to mimic such behavior
+    """
+    gs = cudf.Series(data, nan_as_null=False)
+
+    got = gs.list.unique()
+    expected = cudf.Series(expected, nan_as_null=False).list.sort_values()
+
+    got = got.list.sort_values()
+
+    assert_eq(expected, got)
+
+
 def key_func_builder(x, na_position):
     if x is None:
         if na_position == "first":

From c99fcef41bea8f063953b53bd68b096ec501081c Mon Sep 17 00:00:00 2001
From: Jason Lowe
Date: Wed, 31 Mar 2021 08:32:28 -0500
Subject: [PATCH 03/59] Fix type dispatch for columnar replace_nulls (#7768)

Fixes #7766

Fixes a type dispatch problem where cudf::replace_nulls was not dispatching on
the appropriate type, causing a "No specialization exists for the given type"
error to be thrown when using its columnar form with fixed-point types.

Authors:
  - Jason Lowe (@jlowe)

Approvers:
  - Mike Wilson (@hyperbolic2346)
  - Jake Hemstad (@jrhemstad)

URL: https://github.com/rapidsai/cudf/pull/7768
---
 cpp/src/replace/nulls.cu                  |   2 +-
 cpp/tests/replace/replace_nulls_tests.cpp | 148 +++++++++++++++++++++-
 2 files changed, 148 insertions(+), 2 deletions(-)

diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index afc2bbb37bd..65750deaa57 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -426,7 +426,7 @@ std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
   if (input.is_empty()) { return cudf::empty_like(input); }
   if (!input.has_nulls()) { return std::make_unique<cudf::column>(input); }

-  return cudf::type_dispatcher(
+  return cudf::type_dispatcher<dispatch_storage_type>(
     input.type(), replace_nulls_column_kernel_forwarder{}, input, replacement, stream, mr);
 }

diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp
index bd3bf7ddd03..e969f53609e 100644
--- a/cpp/tests/replace/replace_nulls_tests.cpp
+++ b/cpp/tests/replace/replace_nulls_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright 2019, NVIDIA CORPORATION.
+ * Copyright 2019-2021, NVIDIA CORPORATION.
 *
 * Copyright 2018 BlazingDB, Inc.
 * Copyright 2018 Alexander Ocsa
 *
@@ -23,6 +23,7 @@
 #include
 #include
+#include <cudf/detail/iterator.cuh>
 #include
 #include
 #include
@@ -437,6 +438,151 @@ TYPED_TEST(ReplaceNullsPolicyTest, FollowingFillTrailingNulls)
                              cudf::replace_policy::FOLLOWING);
 }

+template <typename T>
+struct ReplaceNullsFixedPointTest : public cudf::test::BaseFixture {
+};
+
+TYPED_TEST_CASE(ReplaceNullsFixedPointTest, cudf::test::FixedPointTypes);
+
+TYPED_TEST(ReplaceNullsFixedPointTest, ReplaceColumn)
+{
+  auto const scale = numeric::scale_type{0};
+  auto const sz = std::size_t{1000};
+  auto data_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return TypeParam{i, scale};
+  });
+  auto valid_begin =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i % 3 ? 1 : 0; });
+  auto replace_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return TypeParam{-2, scale};
+  });
+  auto expected_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    int val = i % 3 ?
static_cast<int>(i) : -2;
+    return TypeParam{val, scale};
+  });
+
+  ReplaceNullsColumn(
+    cudf::test::fixed_width_column_wrapper<TypeParam>(data_begin, data_begin + sz, valid_begin),
+    cudf::test::fixed_width_column_wrapper<TypeParam>(replace_begin, replace_begin + sz),
+    cudf::test::fixed_width_column_wrapper<TypeParam>(expected_begin, expected_begin + sz));
+}
+
+TYPED_TEST(ReplaceNullsFixedPointTest, ReplaceColumn_Empty)
+{
+  ReplaceNullsColumn(cudf::test::fixed_width_column_wrapper<TypeParam>{},
+                     cudf::test::fixed_width_column_wrapper<TypeParam>{},
+                     cudf::test::fixed_width_column_wrapper<TypeParam>{});
+}
+
+TYPED_TEST(ReplaceNullsFixedPointTest, ReplaceScalar)
+{
+  auto const scale = numeric::scale_type{0};
+  auto const sz = std::size_t{1000};
+  auto data_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return TypeParam{i, scale};
+  });
+  auto valid_begin =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i % 3 ? 1 : 0; });
+  auto expected_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    int val = i % 3 ? static_cast<int>(i) : -2;
+    return TypeParam{val, scale};
+  });
+
+  cudf::fixed_point_scalar<TypeParam> replacement{-2, scale};
+
+  ReplaceNullsScalar(
+    cudf::test::fixed_width_column_wrapper<TypeParam>(data_begin, data_begin + sz, valid_begin),
+    replacement,
+    cudf::test::fixed_width_column_wrapper<TypeParam>(expected_begin, expected_begin + sz));
+}
+
+TYPED_TEST(ReplaceNullsFixedPointTest, ReplacementHasNulls)
+{
+  auto const scale = numeric::scale_type{0};
+  auto const sz = std::size_t{1000};
+  auto data_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return TypeParam{i, scale};
+  });
+  auto data_valid_begin =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i % 3 ? 1 : 0; });
+  auto replace_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return TypeParam{-2, scale};
+  });
+  auto replace_valid_begin =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i % 2 ? 1 : 0; });
+  auto expected_begin = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    int val = i % 3 ? static_cast<int>(i) : -2;
+    return TypeParam{val, scale};
+  });
+  auto expected_valid_begin =
+    cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return i % 6 ?
1 : 0; });
+
+  ReplaceNullsColumn(cudf::test::fixed_width_column_wrapper<TypeParam>(
+                       data_begin, data_begin + sz, data_valid_begin),
+                     cudf::test::fixed_width_column_wrapper<TypeParam>(
+                       replace_begin, replace_begin + sz, replace_valid_begin),
+                     cudf::test::fixed_width_column_wrapper<TypeParam>(
+                       expected_begin, expected_begin + sz, expected_valid_begin));
+}
+
+template <typename T>
+struct ReplaceNullsPolicyFixedPointTest : public cudf::test::BaseFixture {
+};
+
+TYPED_TEST_CASE(ReplaceNullsPolicyFixedPointTest, cudf::test::FixedPointTypes);
+
+TYPED_TEST(ReplaceNullsPolicyFixedPointTest, PrecedingFill)
+{
+  using fp = TypeParam;
+  auto const s = numeric::scale_type{0};
+  auto col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{42, s}, fp{2, s}, fp{1, s}, fp{-10, s}, fp{20, s}, fp{-30, s}}, {1, 0, 0, 1, 0, 1});
+  auto expect_col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{42, s}, fp{42, s}, fp{42, s}, fp{-10, s}, fp{-10, s}, fp{-30, s}}, {1, 1, 1, 1, 1, 1});
+
+  TestReplaceNullsWithPolicy(
+    std::move(col), std::move(expect_col), cudf::replace_policy::PRECEDING);
+}
+
+TYPED_TEST(ReplaceNullsPolicyFixedPointTest, FollowingFill)
+{
+  using fp = TypeParam;
+  auto const s = numeric::scale_type{0};
+  auto col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{42, s}, fp{2, s}, fp{1, s}, fp{-10, s}, fp{20, s}, fp{-30, s}}, {1, 0, 0, 1, 0, 1});
+  auto expect_col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{42, s}, fp{-10, s}, fp{-10, s}, fp{-10, s}, fp{-30, s}, fp{-30, s}}, {1, 1, 1, 1, 1, 1});
+
+  TestReplaceNullsWithPolicy(
+    std::move(col), std::move(expect_col), cudf::replace_policy::FOLLOWING);
+}
+
+TYPED_TEST(ReplaceNullsPolicyFixedPointTest, PrecedingFillLeadingNulls)
+{
+  using fp = TypeParam;
+  auto const s = numeric::scale_type{0};
+  auto col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{1, s}, fp{2, s}, fp{3, s}, fp{4, s}, fp{5, s}}, {0, 0, 1, 0, 1});
+  auto expect_col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{1, s}, fp{2, s}, fp{3, s}, fp{3, s}, fp{5, s}}, {0, 0, 1, 1, 1});
+
+  TestReplaceNullsWithPolicy(
+    std::move(col), std::move(expect_col), cudf::replace_policy::PRECEDING);
+}
+
+TYPED_TEST(ReplaceNullsPolicyFixedPointTest, FollowingFillTrailingNulls)
+{
+  using fp = TypeParam;
+  auto const s = numeric::scale_type{0};
+  auto col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{1, s}, fp{2, s}, fp{3, s}, fp{4, s}, fp{5, s}}, {1, 0, 1, 0, 0});
+  auto expect_col = cudf::test::fixed_width_column_wrapper<fp>(
+    {fp{1, s}, fp{3, s}, fp{3, s}, fp{4, s}, fp{5, s}}, {1, 1, 1, 0, 0});
+
+  TestReplaceNullsWithPolicy(
+    std::move(col), std::move(expect_col), cudf::replace_policy::FOLLOWING);
+}
+
 struct ReplaceDictionaryTest : public cudf::test::BaseFixture {
 };

From be2f0c000f2455a42d299f959e9e816b381ec315 Mon Sep 17 00:00:00 2001
From: Jason Lowe
Date: Wed, 31 Mar 2021 09:19:28 -0500
Subject: [PATCH 04/59] Fix Java explode outer unit tests (#7782)

After #7754, the Java explode outer unit tests were not updated to expect the
nulls that explode-outer now produces.
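For context, a minimal C++ sketch of the semantics the updated tests expect, written against the `cudf::explode_outer` API introduced by #7754 (the input values and helper name here are illustrative only, not taken from the test suite):

```cpp
#include <cudf/detail/iterator.cuh>
#include <cudf/lists/explode.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>

std::unique_ptr<cudf::table> explode_outer_example()
{
  using lists = cudf::test::lists_column_wrapper<int32_t>;
  // rows: [1, 2], null, [] -- explode() drops the null and empty rows,
  // while explode_outer() keeps them, emitting a null element for each
  // (and a null position in the explode_outer_position() variant).
  auto valids = cudf::detail::make_counting_transform_iterator(
    0, [](auto i) { return i != 1; });
  lists col0{{lists{1, 2}, lists{}, lists{}}, valids};
  cudf::test::strings_column_wrapper col1{"a", "b", "c"};
  // explode column 0; col1 values are replicated per exploded element
  return cudf::explode_outer(cudf::table_view{{col0, col1}}, 0);
}
```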
Authors: - Jason Lowe (@jlowe) Approvers: - Robert (Bobby) Evans (@revans2) URL: https://github.com/rapidsai/cudf/pull/7782 --- java/src/test/java/ai/rapids/cudf/TableTest.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 9c67966c16c..8b7ece5d60b 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -5067,7 +5067,7 @@ private Table[] buildExplodeTestTableWithPrimitiveTypes(boolean pos, boolean out .build()) { Table.TestBuilder expectedBuilder = new Table.TestBuilder(); if (pos) { - Integer[] posData = outer ? new Integer[]{0, 1, 2, 0, 1, 0, 0, 0} : new Integer[]{0, 1, 2, 0, 1, 0}; + Integer[] posData = outer ? new Integer[]{0, 1, 2, 0, 1, 0, null, null} : new Integer[]{0, 1, 2, 0, 1, 0}; expectedBuilder.column(posData); } List expectedData = new ArrayList(){{ @@ -5109,10 +5109,11 @@ private Table[] buildExplodeTestTableWithNestedTypes(boolean pos, boolean outer) .build()) { Table.TestBuilder expectedBuilder = new Table.TestBuilder(); if (pos) { - if (!outer) + if (outer) { + expectedBuilder.column(0, 1, 2, 0, 1, 0, null, null); + } else { expectedBuilder.column(0, 1, 2, 0, 1, 0, 0); - else - expectedBuilder.column(0, 1, 2, 0, 1, 0, 0, 0); + } } List expectedData = new ArrayList(){{ if (!outer) { From b9371122eacf8c1376f0185df409e906d7b3c4e5 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Wed, 31 Mar 2021 09:57:11 -0500 Subject: [PATCH 05/59] get_json_object() implementation (#7286) An implementation of get_json_object(). Reference: https://cwiki.apache.org/confluence/display/Hive/LanguageManual+UDF#LanguageManualUDF-get_json_object The fundamental functionality here is running a JSONPath query on each row in an input column of json strings. JSONPath spec: https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html For review purposes, the key entry point is `parse_json_path()`. Each thread of the kernel processes 1 row via this function. The behavior is recursive in nature but we maintain our own context stack to do it in loop fashion. `parse_json_path` is just the high level controlling logic, with most of the heavy lifting happening in the `json_state` parser class. Though the "heavy lifting" is pretty much just traditional string parsing code. The path to optimization here (I'll open a separate cudf issue for this) is - Change `parse_json_path` to work on a warp basis. So each row in the column would be processed by one warp. - Make the `json_state` parser class thread/warp aware (the class would just store its `tid` and operate accordingly). I think this is reasonably straightforward to do as most of the cuIO decoding kernels behave like this. 
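To make the intended usage concrete, here is a minimal C++ sketch of calling the new API (the documents and query are illustrative; only the `cudf::strings::get_json_object` signature comes from this PR):

```cpp
#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/json.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

std::unique_ptr<cudf::column> get_json_object_example()
{
  // one json document per row
  cudf::test::strings_column_wrapper input{
    R"({"store": {"book": [{"title": "Moby Dick", "price": 8.99}]}})",
    R"({"store": {"book": [{"title": "Sword of Honour", "price": 12.99}]}})"};
  // the same JSONPath query is applied to every row, producing one
  // (possibly null, if nothing matched) string result per input row
  cudf::string_scalar json_path("$.store.book[0].title");
  return cudf::strings::get_json_object(
    cudf::strings_column_view{input}, json_path);
}
```

Running the query above should yield the strings column ["Moby Dick", "Sword of Honour"].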
Authors: - @nvdbaranec - Raza Jafri (@razajafri) Approvers: - Ray Douglass (@raydouglass) - Jason Lowe (@jlowe) - Jake Hemstad (@jrhemstad) - David (@davidwendt) URL: https://github.com/rapidsai/cudf/pull/7286 --- conda/recipes/libcudf/meta.yaml | 2 + cpp/CMakeLists.txt | 1 + cpp/benchmarks/CMakeLists.txt | 5 + cpp/benchmarks/string/json_benchmark.cpp | 140 +++ cpp/include/cudf/strings/detail/json.hpp | 40 + cpp/include/cudf/strings/json.hpp | 50 + cpp/include/doxygen_groups.h | 1 + cpp/src/io/csv/csv_gpu.cu | 6 +- cpp/src/io/json/json_gpu.cu | 4 +- cpp/src/io/utilities/parsing_utils.cuh | 144 +-- cpp/src/strings/json/json_path.cu | 952 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/strings/json_tests.cpp | 761 ++++++++++++++ cpp/tests/utilities/column_utilities.cu | 2 +- .../main/java/ai/rapids/cudf/ColumnView.java | 19 + java/src/main/native/src/ColumnViewJni.cpp | 23 + .../java/ai/rapids/cudf/ColumnVectorTest.java | 44 + 17 files changed, 2117 insertions(+), 78 deletions(-) create mode 100644 cpp/benchmarks/string/json_benchmark.cpp create mode 100644 cpp/include/cudf/strings/detail/json.hpp create mode 100644 cpp/include/cudf/strings/json.hpp create mode 100644 cpp/src/strings/json/json_path.cu create mode 100644 cpp/tests/strings/json_tests.cpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 39587b4bd05..75955428eab 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -178,12 +178,14 @@ test: - test -f $PREFIX/include/cudf/strings/detail/converters.hpp - test -f $PREFIX/include/cudf/strings/detail/copying.hpp - test -f $PREFIX/include/cudf/strings/detail/fill.hpp + - test -f $PREFIX/include/cudf/strings/detail/json.hpp - test -f $PREFIX/include/cudf/strings/detail/replace.hpp - test -f $PREFIX/include/cudf/strings/detail/utilities.hpp - test -f $PREFIX/include/cudf/strings/extract.hpp - test -f $PREFIX/include/cudf/strings/findall.hpp - test -f $PREFIX/include/cudf/strings/find.hpp - test -f $PREFIX/include/cudf/strings/find_multiple.hpp + - test -f $PREFIX/include/cudf/strings/json.hpp - test -f $PREFIX/include/cudf/strings/padding.hpp - test -f $PREFIX/include/cudf/strings/replace.hpp - test -f $PREFIX/include/cudf/strings/replace_re.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5cd82e52180..61cb13d3445 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -346,6 +346,7 @@ add_library(cudf src/strings/find.cu src/strings/find_multiple.cu src/strings/padding.cu + src/strings/json/json_path.cu src/strings/regex/regcomp.cpp src/strings/regex/regexec.cu src/strings/replace/backref_re.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 5aa7e0132f8..11af408f1c5 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -202,3 +202,8 @@ ConfigureBench(STRINGS_BENCH string/substring_benchmark.cpp string/translate_benchmark.cpp string/url_decode_benchmark.cpp) + +################################################################################################### +# - json benchmark ------------------------------------------------------------------- +ConfigureBench(JSON_BENCH + string/json_benchmark.cpp) diff --git a/cpp/benchmarks/string/json_benchmark.cpp b/cpp/benchmarks/string/json_benchmark.cpp new file mode 100644 index 00000000000..6fb6a07a8d0 --- /dev/null +++ b/cpp/benchmarks/string/json_benchmark.cpp @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +#include +#include + +class JsonPath : public cudf::benchmark { +}; + +float frand() { return static_cast(rand()) / static_cast(RAND_MAX); } + +int rand_range(int min, int max) { return min + static_cast(frand() * (max - min)); } + +std::vector Books{ + "{\n\"category\": \"reference\",\n\"author\": \"Nigel Rees\",\n\"title\": \"Sayings of the " + "Century\",\n\"price\": 8.95\n}", + "{\n\"category\": \"fiction\",\n\"author\": \"Evelyn Waugh\",\n\"title\": \"Sword of " + "Honour\",\n\"price\": 12.99\n}", + "{\n\"category\": \"fiction\",\n\"author\": \"Herman Melville\",\n\"title\": \"Moby " + "Dick\",\n\"isbn\": \"0-553-21311-3\",\n\"price\": 8.99\n}", + "{\n\"category\": \"fiction\",\n\"author\": \"J. R. R. Tolkien\",\n\"title\": \"The Lord of the " + "Rings\",\n\"isbn\": \"0-395-19395-8\",\n\"price\": 22.99\n}"}; +constexpr int Approx_book_size = 110; +std::vector Bicycles{ + "{\"color\": \"red\", \"price\": 9.95}", + "{\"color\": \"green\", \"price\": 29.95}", + "{\"color\": \"blue\", \"price\": 399.95}", + "{\"color\": \"yellow\", \"price\": 99.95}", + "{\"color\": \"mauve\", \"price\": 199.95}", +}; +constexpr int Approx_bicycle_size = 33; +std::string Misc{"\n\"expensive\": 10\n"}; +std::string generate_field(std::vector const& values, int num_values) +{ + std::string res; + for (int idx = 0; idx < num_values; idx++) { + if (idx > 0) { res += std::string(",\n"); } + int vindex = std::min(static_cast(floor(frand() * values.size())), + static_cast(values.size() - 1)); + res += values[vindex]; + } + return res; +} + +std::string build_row(int desired_bytes) +{ + // always have at least 2 books and 2 bikes + int num_books = 2; + int num_bicycles = 2; + int remaining_bytes = + desired_bytes - ((num_books * Approx_book_size) + (num_bicycles * Approx_bicycle_size)); + + // divide up the remainder between books and bikes + float book_pct = frand(); + float bicycle_pct = 1.0f - book_pct; + num_books += (remaining_bytes * book_pct) / Approx_book_size; + num_bicycles += (remaining_bytes * bicycle_pct) / Approx_bicycle_size; + + std::string books = "\"book\": [\n" + generate_field(Books, num_books) + "]\n"; + std::string bicycles = "\"bicycle\": [\n" + generate_field(Bicycles, num_bicycles) + "]\n"; + + std::string store = "\"store\": {\n"; + if (frand() <= 0.5f) { + store += books + std::string(",\n") + bicycles; + } else { + store += bicycles + std::string(",\n") + books; + } + store += std::string("}\n"); + + std::string row = std::string("{\n"); + if (frand() <= 0.5f) { + row += store + std::string(",\n") + Misc; + } else { + row += Misc + std::string(",\n") + store; + } + row += std::string("}\n"); + return row; +} + +template +static void BM_case(benchmark::State& state, QueryArg&&... 
query_arg) +{ + srand(5236); + auto iter = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + [desired_bytes = state.range(1)](int index) { return build_row(desired_bytes); }); + int num_rows = state.range(0); + cudf::test::strings_column_wrapper input(iter, iter + num_rows); + cudf::strings_column_view scv(input); + size_t num_chars = scv.chars().size(); + + std::string json_path(query_arg...); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + auto result = cudf::strings::get_json_object(scv, json_path); + cudaStreamSynchronize(0); + } + + // this isn't strictly 100% accurate. a given query isn't necessarily + // going to visit every single incoming character. but in spirit it does. + state.SetBytesProcessed(state.iterations() * num_chars); +} + +#define JSON_BENCHMARK_DEFINE(name, query) \ + BENCHMARK_CAPTURE(BM_case, name, query) \ + ->ArgsProduct({{100, 1000, 100000, 400000}, {300, 600, 4096}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +JSON_BENCHMARK_DEFINE(query0, "$"); +JSON_BENCHMARK_DEFINE(query1, "$.store"); +JSON_BENCHMARK_DEFINE(query2, "$.store.book"); +JSON_BENCHMARK_DEFINE(query3, "$.store.*"); +JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]"); +JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category"); +JSON_BENCHMARK_DEFINE(query6, "$.store['bicycle']"); +JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']"); +JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]"); diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/strings/detail/json.hpp new file mode 100644 index 00000000000..e6a0b49f102 --- /dev/null +++ b/cpp/include/cudf/strings/detail/json.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * @copydoc cudf::strings::get_json_object + * + * @param stream CUDA stream used for device memory operations and kernel launches + */ +std::unique_ptr get_json_object( + cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp new file mode 100644 index 00000000000..b39e4a2027c --- /dev/null +++ b/cpp/include/cudf/strings/json.hpp @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +namespace cudf { +namespace strings { + +/** + * @addtogroup strings_json + * @{ + * @file + */ + +/** + * @brief Apply a JSONPath string to all rows in an input strings column. + * + * Applies a JSONPath string to an incoming strings column where each row in the column + * is a valid json string. The output is returned by row as a strings column. + * + * https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html + * Implements only the operators: $ . [] * + * + * @param col The input strings column. Each row must contain a valid json string + * @param json_path The JSONPath string to be applied to each row + * @param mr Resource for allocating device memory. + * @return New strings column containing the retrieved json object strings + */ +std::unique_ptr get_json_object( + cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 65dd5c73475..f78ff98d49d 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -127,6 +127,7 @@ * @defgroup strings_modify Modifying * @defgroup strings_replace Replacing * @defgroup strings_split Splitting + * @defgroup strings_json JSON * @} * @defgroup dictionary_apis Dictionary * @{ diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 86e5f1fdcae..44acc7fc55f 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -196,7 +196,7 @@ __global__ void __launch_bounds__(csvparse_block_dim) } else if (serialized_trie_contains(opts.trie_true, {field_start, field_len}) || serialized_trie_contains(opts.trie_false, {field_start, field_len})) { atomicAdd(&d_columnData[actual_col].bool_count, 1); - } else if (cudf::io::gpu::is_infinity(field_start, next_delimiter)) { + } else if (cudf::io::is_infinity(field_start, next_delimiter)) { atomicAdd(&d_columnData[actual_col].float_count, 1); } else { long countNumber = 0; @@ -277,7 +277,7 @@ __inline__ __device__ T decode_value(char const *begin, char const *end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } template @@ -285,7 +285,7 @@ __inline__ __device__ T decode_value(char const *begin, char const *end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } template <> diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index 5efb64fd4d5..75910ae6b5b 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -114,7 +114,7 @@ __inline__ __device__ T decode_value(const char *begin, uint64_t end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } /** @@ -131,7 +131,7 @@ __inline__ __device__ T decode_value(const char *begin, const 
char *end, parse_options_view const &opts) { - return cudf::io::gpu::parse_numeric(begin, end, opts); + return cudf::io::parse_numeric(begin, end, opts); } /** diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 584d2c9a74a..b7719cba580 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -20,6 +20,8 @@ #include #include +#include + #include using cudf::device_span; @@ -82,67 +84,6 @@ struct parse_options { } }; -namespace gpu { -/** - * @brief CUDA kernel iterates over the data until the end of the current field - * - * Also iterates over (one or more) delimiter characters after the field. - * Function applies to formats with field delimiters and line terminators. - * - * @param begin Pointer to the first element of the string - * @param end Pointer to the first element after the string - * @param opts A set of parsing options - * @param escape_char A boolean value to signify whether to consider `\` as escape character or - * just a character. - * - * @return Pointer to the last character in the field, including the - * delimiter(s) following the field data - */ -__device__ __inline__ char const* seek_field_end(char const* begin, - char const* end, - parse_options_view const& opts, - bool escape_char = false) -{ - bool quotation = false; - auto current = begin; - bool escape_next = false; - while (true) { - // Use simple logic to ignore control chars between any quote seq - // Handles nominal cases including doublequotes within quotes, but - // may not output exact failures as PANDAS for malformed fields. - // Check for instances such as "a2\"bc" and "\\" if `escape_char` is true. - - if (*current == opts.quotechar and not escape_next) { - quotation = !quotation; - } else if (!quotation) { - if (*current == opts.delimiter) { - while (opts.multi_delimiter && current < end && *(current + 1) == opts.delimiter) { - ++current; - } - break; - } else if (*current == opts.terminator) { - break; - } else if (*current == '\r' && (current + 1 < end && *(current + 1) == '\n')) { - --end; - break; - } - } - - if (escape_char == true) { - // If a escape character is encountered, escape next character in next loop. - if (escape_next == false and *current == '\\') { - escape_next = true; - } else { - escape_next = false; - } - } - - if (current >= end) break; - current++; - } - return current; -} - /** * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization * for integral types. Handles hexadecimal digits, both uppercase and lowercase. @@ -155,7 +96,7 @@ __device__ __inline__ char const* seek_field_end(char const* begin, * @return uint8_t Numeric value of the character, or `0` */ template ::value>* = nullptr> -__device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) +constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; if (c >= 'a' && c <= 'f') return c - 'a' + 10; @@ -176,7 +117,7 @@ __device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) * @return uint8_t Numeric value of the character, or `0` */ template ::value>* = nullptr> -__device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) +constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; @@ -185,10 +126,7 @@ __device__ __forceinline__ uint8_t decode_digit(char c, bool* valid_flag) } // Converts character to lowercase. 
-__inline__ __device__ char to_lower(char const c) -{ - return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; -} +constexpr char to_lower(char const c) { return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; } /** * @brief Checks if string is infinity, case insensitive with/without sign @@ -199,7 +137,7 @@ __inline__ __device__ char to_lower(char const c) * @param end Pointer to the first element after the string * @return true if string is valid infinity, else false. */ -__inline__ __device__ bool is_infinity(char const* begin, char const* end) +constexpr bool is_infinity(char const* begin, char const* end) { if (*begin == '-' || *begin == '+') begin++; char const* cinf = "infinity"; @@ -223,9 +161,10 @@ __inline__ __device__ bool is_infinity(char const* begin, char const* end) * @return The parsed and converted value */ template -__inline__ __device__ T parse_numeric(const char* begin, - const char* end, - parse_options_view const& opts) +constexpr T parse_numeric(const char* begin, + const char* end, + parse_options_view const& opts, + T error_result = std::numeric_limits::quiet_NaN()) { T value{}; bool all_digits_valid = true; @@ -281,11 +220,72 @@ __inline__ __device__ T parse_numeric(const char* begin, if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); } } } - if (!all_digits_valid) { return std::numeric_limits::quiet_NaN(); } + if (!all_digits_valid) { return error_result; } return value * sign; } +namespace gpu { +/** + * @brief CUDA kernel iterates over the data until the end of the current field + * + * Also iterates over (one or more) delimiter characters after the field. + * Function applies to formats with field delimiters and line terminators. + * + * @param begin Pointer to the first element of the string + * @param end Pointer to the first element after the string + * @param opts A set of parsing options + * @param escape_char A boolean value to signify whether to consider `\` as escape character or + * just a character. + * + * @return Pointer to the last character in the field, including the + * delimiter(s) following the field data + */ +__device__ __inline__ char const* seek_field_end(char const* begin, + char const* end, + parse_options_view const& opts, + bool escape_char = false) +{ + bool quotation = false; + auto current = begin; + bool escape_next = false; + while (true) { + // Use simple logic to ignore control chars between any quote seq + // Handles nominal cases including doublequotes within quotes, but + // may not output exact failures as PANDAS for malformed fields. + // Check for instances such as "a2\"bc" and "\\" if `escape_char` is true. + + if (*current == opts.quotechar and not escape_next) { + quotation = !quotation; + } else if (!quotation) { + if (*current == opts.delimiter) { + while (opts.multi_delimiter && current < end && *(current + 1) == opts.delimiter) { + ++current; + } + break; + } else if (*current == opts.terminator) { + break; + } else if (*current == '\r' && (current + 1 < end && *(current + 1) == '\n')) { + --end; + break; + } + } + + if (escape_char == true) { + // If a escape character is encountered, escape next character in next loop. 
+ if (escape_next == false and *current == '\\') { + escape_next = true; + } else { + escape_next = false; + } + } + + if (current >= end) break; + current++; + } + return current; +} + /** * @brief Lexicographically compare digits in input against string * representing an integer diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu new file mode 100644 index 00000000000..cd8aae12070 --- /dev/null +++ b/cpp/src/strings/json/json_path.cu @@ -0,0 +1,952 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +namespace { + +// debug accessibility + +// change to "\n" and 1 to make output more readable +#define DEBUG_NEWLINE +constexpr int DEBUG_NEWLINE_LEN = 0; + +/** + * @brief Result of calling a parse function. + * + * The primary use of this is to distinguish between "success" and + * "success but no data" return cases. For example, if you are reading the + * values of an array you might call a parse function in a while loop. You + * would want to continue doing this until you either encounter an error (parse_result::ERROR) + * or you get nothing back (parse_result::EMPTY) + */ +enum class parse_result { + ERROR, // failure + SUCCESS, // success + EMPTY, // success, but no data +}; + +/** + * @brief Base parser class inherited by the (device-side) json_state class and + * (host-side) path_state class. + * + * Contains a number of useful utility functions common to parsing json and + * JSONPath strings. + */ +class parser { + protected: + CUDA_HOST_DEVICE_CALLABLE parser() : input(nullptr), input_len(0), pos(nullptr) {} + CUDA_HOST_DEVICE_CALLABLE parser(const char* _input, int64_t _input_len) + : input(_input), input_len(_input_len), pos(_input) + { + parse_whitespace(); + } + + CUDA_HOST_DEVICE_CALLABLE parser(parser const& p) + : input(p.input), input_len(p.input_len), pos(p.pos) + { + } + + CUDA_HOST_DEVICE_CALLABLE bool eof(const char* p) { return p - input >= input_len; } + CUDA_HOST_DEVICE_CALLABLE bool eof() { return eof(pos); } + + CUDA_HOST_DEVICE_CALLABLE bool parse_whitespace() + { + while (!eof()) { + if (is_whitespace(*pos)) { + pos++; + } else { + return true; + } + } + return false; + } + + CUDA_HOST_DEVICE_CALLABLE parse_result parse_string(string_view& str, + bool can_be_empty, + char quote) + { + str = string_view(nullptr, 0); + + if (parse_whitespace() && *pos == quote) { + const char* start = ++pos; + while (!eof()) { + if (*pos == quote) { + str = string_view(start, pos - start); + pos++; + return parse_result::SUCCESS; + } + pos++; + } + } + + return can_be_empty ? 
parse_result::EMPTY : parse_result::ERROR; + } + + // a name means: + // - a string followed by a : + // - no string + CUDA_HOST_DEVICE_CALLABLE parse_result parse_name(string_view& name, + bool can_be_empty, + char quote) + { + if (parse_string(name, can_be_empty, quote) == parse_result::ERROR) { + return parse_result::ERROR; + } + + // if we got a real string, the next char must be a : + if (name.size_bytes() > 0) { + if (!parse_whitespace()) { return parse_result::ERROR; } + if (*pos == ':') { + pos++; + return parse_result::SUCCESS; + } + } + return parse_result::EMPTY; + } + + // numbers, true, false, null. + // this function is not particularly strong. badly formed values will get + // consumed without throwing any errors + CUDA_HOST_DEVICE_CALLABLE parse_result parse_non_string_value(string_view& val) + { + if (!parse_whitespace()) { return parse_result::ERROR; } + + // parse to the end of the value + char const* start = pos; + char const* end = start; + while (!eof(end)) { + char const c = *end; + if (c == ',' || c == '}' || c == ']' || is_whitespace(c)) { break; } + + // illegal chars + if (c == '[' || c == '{' || c == ':' || c == '\"') { return parse_result::ERROR; } + end++; + } + pos = end; + + val = string_view(start, end - start); + + return parse_result::SUCCESS; + } + + protected: + char const* input; + int64_t input_len; + char const* pos; + + private: + CUDA_HOST_DEVICE_CALLABLE bool is_whitespace(char c) { return c <= ' '; } +}; + +/** + * @brief Output buffer object. Used during the preprocess/size-computation step + * and the actual output step. + * + * There is an important distinction between two cases: + * + * - producing no output at all. that is, the query matched nothing in the input. + * - producing empty output. the query matched something in the input, but the + * value of the result is an empty string. + * + * The `has_output` field is the flag which indicates whether or not the output + * from the query should be considered empty or null. + * + */ +struct json_output { + size_t output_max_len; + char* output; + thrust::optional output_len; + + __device__ void add_output(const char* str, size_t len) + { + if (output != nullptr) { memcpy(output + output_len.value_or(0), str, len); } + output_len = output_len.value_or(0) + len; + } + + __device__ void add_output(string_view const& str) { add_output(str.data(), str.size_bytes()); } +}; + +enum json_element_type { NONE, OBJECT, ARRAY, VALUE }; + +/** + * @brief Parsing class that holds the current state of the json to be parse and provides + * functions for navigating through it. + */ +class json_state : private parser { + public: + __device__ json_state() + : parser(), + cur_el_start(nullptr), + cur_el_type(json_element_type::NONE), + parent_el_type(json_element_type::NONE) + { + } + __device__ json_state(const char* _input, int64_t _input_len) + : parser(_input, _input_len), + cur_el_start(nullptr), + cur_el_type(json_element_type::NONE), + parent_el_type(json_element_type::NONE) + { + } + + __device__ json_state(json_state const& j) + : parser(j), + cur_el_start(j.cur_el_start), + cur_el_type(j.cur_el_type), + parent_el_type(j.parent_el_type) + { + } + + // retrieve the entire current element into the output + __device__ parse_result extract_element(json_output* output, bool list_element) + { + char const* start = cur_el_start; + char const* end = start; + + // if we're a value type, do a simple value parse. 
+ if (cur_el_type == VALUE) { + pos = cur_el_start; + if (parse_value() != parse_result::SUCCESS) { return parse_result::ERROR; } + end = pos; + + // SPARK-specific behavior. if this is a non-list-element wrapped in quotes, + // strip them. we may need to make this behavior configurable in some way + // later on. + if (!list_element && *start == '\"' && *(end - 1) == '\"') { + start++; + end--; + } + } + // otherwise, march through everything inside + else { + int obj_count = 0; + int arr_count = 0; + + while (!eof(end)) { + // could do some additional checks here. we know our current + // element type, so we could be more strict on what kinds of + // characters we expect to see. + switch (*end++) { + case '{': obj_count++; break; + case '}': obj_count--; break; + case '[': arr_count++; break; + case ']': arr_count--; break; + default: break; + } + if (obj_count == 0 && arr_count == 0) { break; } + } + if (obj_count > 0 || arr_count > 0) { return parse_result::ERROR; } + pos = end; + } + + // parse trailing , + if (parse_whitespace()) { + if (*pos == ',') { pos++; } + } + + if (output != nullptr) { output->add_output({start, static_cast(end - start)}); } + return parse_result::SUCCESS; + } + + // skip the next element + __device__ parse_result skip_element() { return extract_element(nullptr, false); } + + // advance to the next element + __device__ parse_result next_element() { return next_element_internal(false); } + + // advance inside the current element + __device__ parse_result child_element(json_element_type expected_type) + { + if (expected_type != NONE && cur_el_type != expected_type) { return parse_result::ERROR; } + + // if we succeed, record our parent element type. + auto const prev_el_type = cur_el_type; + auto const result = next_element_internal(true); + if (result == parse_result::SUCCESS) { parent_el_type = prev_el_type; } + return result; + } + + // return the next element that matches the specified name. + __device__ parse_result next_matching_element(string_view const& name, bool inclusive) + { + // if we're not including the current element, skip it + if (!inclusive) { + parse_result result = next_element_internal(false); + if (result != parse_result::SUCCESS) { return result; } + } + // loop until we find a match or there's nothing left + do { + // wildcard matches anything + if (name.size_bytes() == 1 && name.data()[0] == '*') { + return parse_result::SUCCESS; + } else if (cur_el_name == name) { + return parse_result::SUCCESS; + } + + // next + parse_result result = next_element_internal(false); + if (result != parse_result::SUCCESS) { return result; } + } while (1); + + return parse_result::ERROR; + } + + private: + // parse a value - either a string or a number/null/bool + __device__ parse_result parse_value() + { + if (!parse_whitespace()) { return parse_result::ERROR; } + + // string or number? + string_view unused; + return *pos == '\"' ? parse_string(unused, false, '\"') : parse_non_string_value(unused); + } + + __device__ parse_result next_element_internal(bool child) + { + // if we're not getting a child element, skip the current element. + // this will leave pos as the first character -after- the close of + // the current element + if (!child && cur_el_start != nullptr) { + if (skip_element() == parse_result::ERROR) { return parse_result::ERROR; } + cur_el_start = nullptr; + } + // otherwise pos will be at the first character within the current element + + // can only get the child of an object or array. 
+    // this could theoretically be handled as an error, but the evaluators I've found
+    // seem to treat this as "it's nothing"
+    if (child && (cur_el_type == VALUE || cur_el_type == NONE)) { return parse_result::EMPTY; }
+
+    // what's next
+    if (!parse_whitespace()) { return parse_result::EMPTY; }
+    // if we're closing off a parent element, we're done
+    char const c = *pos;
+    if (c == ']' || c == '}') { return parse_result::EMPTY; }
+
+    // if we're not accessing elements of an array, check for name.
+    bool const array_access =
+      (cur_el_type == ARRAY && child) || (parent_el_type == ARRAY && !child);
+    if (!array_access && parse_name(cur_el_name, true, '\"') == parse_result::ERROR) {
+      return parse_result::ERROR;
+    }
+
+    // element type
+    if (!parse_whitespace()) { return parse_result::EMPTY; }
+    switch (*pos++) {
+      case '[': cur_el_type = ARRAY; break;
+      case '{': cur_el_type = OBJECT; break;
+
+      case ',':
+      case ':':
+      case '\'': return parse_result::ERROR;
+
+      // value type
+      default: cur_el_type = VALUE; break;
+    }
+
+    // the start of the current element is always at the value, not the name
+    cur_el_start = pos - 1;
+    return parse_result::SUCCESS;
+  }
+
+  const char* cur_el_start;          // pointer to the first character of the -value- of the current
+                                     // element - not the name
+  string_view cur_el_name;           // name of the current element (if applicable)
+  json_element_type cur_el_type;     // type of the current element
+  json_element_type parent_el_type;  // parent element type
+};
+
+enum class path_operator_type { ROOT, CHILD, CHILD_WILDCARD, CHILD_INDEX, ERROR, END };
+
+/**
+ * @brief A "command" operator used to query a json string. A full query is
+ * an array of these operators applied to the incoming json string.
+ */
+struct path_operator {
+  CUDA_HOST_DEVICE_CALLABLE path_operator()
+    : type(path_operator_type::ERROR), index(-1), expected_type{NONE}
+  {
+  }
+  CUDA_HOST_DEVICE_CALLABLE path_operator(path_operator_type _type,
+                                          json_element_type _expected_type = NONE)
+    : type(_type), index(-1), expected_type{_expected_type}
+  {
+  }
+
+  path_operator_type type;  // operator type
+  // the expected element type we're applying this operation to.
+  // for example:
+  //    - you cannot retrieve a subscripted field (eg [5]) from an object.
+  //    - you cannot retrieve a field by name (eg .book) from an array.
+  //    - you -can- use .* for both arrays and objects
+  // a value of NONE implies any type is accepted
+  json_element_type expected_type;  // the expected type of the element we're working with
+  string_view name;                 // name to match against (if applicable)
+  int index;                        // index for subscript operator
+};
+
+/**
+ * @brief Parsing class that holds the current state of the JSONPath string to be parsed
+ * and provides functions for navigating through it. This is only called on the host
+ * during the preprocess step which builds a command buffer that the gpu uses.
+ */ +class path_state : private parser { + public: + path_state(const char* _path, size_t _path_len) : parser(_path, _path_len) {} + + // get the next operator in the JSONPath string + path_operator get_next_operator() + { + if (eof()) { return {path_operator_type::END}; } + + switch (*pos++) { + case '$': return {path_operator_type::ROOT}; + + case '.': { + path_operator op; + string_view term{".[", 2}; + if (parse_path_name(op.name, term)) { + // this is another potential use case for __SPARK_BEHAVIORS / configurability + // Spark currently only handles the wildcard operator inside [*], it does + // not handle .* + if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') { + op.type = path_operator_type::CHILD_WILDCARD; + op.expected_type = NONE; + } else { + op.type = path_operator_type::CHILD; + op.expected_type = OBJECT; + } + return op; + } + } break; + + // 3 ways this can be used + // indices: [0] + // name: ['book'] + // wildcard: [*] + case '[': { + path_operator op; + string_view term{"]", 1}; + bool const is_string = *pos == '\'' ? true : false; + if (parse_path_name(op.name, term)) { + pos++; + if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') { + op.type = path_operator_type::CHILD_WILDCARD; + op.expected_type = NONE; + } else { + if (is_string) { + op.type = path_operator_type::CHILD; + op.expected_type = OBJECT; + } else { + op.type = path_operator_type::CHILD_INDEX; + op.index = cudf::io::parse_numeric( + op.name.data(), op.name.data() + op.name.size_bytes(), json_opts, -1); + CUDF_EXPECTS(op.index >= 0, "Invalid numeric index specified in JSONPath"); + op.expected_type = ARRAY; + } + } + return op; + } + } break; + + // wildcard operator + case '*': { + pos++; + return path_operator{path_operator_type::CHILD_WILDCARD}; + } break; + + default: CUDF_FAIL("Unrecognized JSONPath operator"); break; + } + return {path_operator_type::ERROR}; + } + + private: + cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'}; + + bool parse_path_name(string_view& name, string_view const& terminators) + { + switch (*pos) { + case '*': + name = string_view(pos, 1); + pos++; + break; + + case '\'': + if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; } + break; + + default: { + size_t const chars_left = input_len - (pos - input); + char const* end = std::find_first_of( + pos, pos + chars_left, terminators.data(), terminators.data() + terminators.size_bytes()); + if (end) { + name = string_view(pos, end - pos); + pos = end; + } else { + name = string_view(pos, chars_left); + pos = input + input_len; + } + break; + } + } + + // an empty name is not valid + CUDF_EXPECTS(name.size_bytes() > 0, "Invalid empty name in JSONPath query string"); + + return true; + } +}; + +/** + * @brief Preprocess the incoming JSONPath string on the host to generate a + * command buffer for use by the GPU. + * + * @param json_path The incoming json path + * @param stream Cuda stream to perform any gpu actions on + * @returns A pair containing the command buffer, and maximum stack depth required. 
+ */
+std::pair<thrust::optional<rmm::device_uvector<path_operator>>, int> build_command_buffer(
+  cudf::string_scalar const& json_path, rmm::cuda_stream_view stream)
+{
+  std::string h_json_path = json_path.to_string(stream);
+  path_state p_state(h_json_path.data(), static_cast<size_t>(h_json_path.size()));
+
+  std::vector<path_operator> h_operators;
+
+  path_operator op;
+  int max_stack_depth = 1;
+  do {
+    op = p_state.get_next_operator();
+    if (op.type == path_operator_type::ERROR) {
+      CUDF_FAIL("Encountered invalid JSONPath input string");
+    }
+    if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; }
+    // convert pointer to device pointer
+    if (op.name.size_bytes() > 0) {
+      op.name =
+        string_view(json_path.data() + (op.name.data() - h_json_path.data()), op.name.size_bytes());
+    }
+    if (op.type == path_operator_type::ROOT) {
+      CUDF_EXPECTS(h_operators.size() == 0, "Root operator ($) can only exist at the root");
+    }
+    // if we haven't gotten a root operator to start, and we're not empty, quietly push a
+    // root operator now.
+    if (h_operators.size() == 0 && op.type != path_operator_type::ROOT &&
+        op.type != path_operator_type::END) {
+      h_operators.push_back(path_operator{path_operator_type::ROOT});
+    }
+    h_operators.push_back(op);
+  } while (op.type != path_operator_type::END);
+
+  auto const is_empty = h_operators.size() == 1 && h_operators[0].type == path_operator_type::END;
+  return is_empty
+           ? std::make_pair(thrust::nullopt, 0)
+           : std::make_pair(
+               thrust::make_optional(cudf::detail::make_device_uvector_sync(h_operators, stream)),
+               max_stack_depth);
+}
+
+#define PARSE_TRY(_x)                                                       \
+  do {                                                                      \
+    last_result = _x;                                                       \
+    if (last_result == parse_result::ERROR) { return parse_result::ERROR; } \
+  } while (0)
+
+/**
+ * @brief Parse a single json string using the provided command buffer
+ *
+ * @param j_state The incoming json string and associated parser
+ * @param commands The command buffer to be applied to the string. Always ends with a
+ * path_operator_type::END
+ * @param output Buffer used to store the results of the query
+ * @returns A result code indicating success/fail/empty.
+ */
+template <int max_command_stack_depth>
+__device__ parse_result parse_json_path(json_state& j_state,
+                                        path_operator const* commands,
+                                        json_output& output)
+{
+  // manually maintained context stack in lieu of calling parse_json_path recursively.
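+  // Each context entry snapshots the parser state, the remaining command sequence, and
+  // two flags: list_element (is this result being emitted into a wildcard's output
+  // array?) and state_flag (has this wildcard already consumed its first element?).
+  // push_context() fails (returns false) rather than overflow the fixed-size stack.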
+ struct context { + json_state j_state; + path_operator const* commands; + bool list_element; + bool state_flag; + }; + context stack[max_command_stack_depth]; + int stack_pos = 0; + auto push_context = [&stack, &stack_pos](json_state const& _j_state, + path_operator const* _commands, + bool _list_element = false, + bool _state_flag = false) { + if (stack_pos == max_command_stack_depth - 1) { return false; } + stack[stack_pos++] = context{_j_state, _commands, _list_element, _state_flag}; + return true; + }; + auto pop_context = [&stack, &stack_pos](context& c) { + if (stack_pos > 0) { + c = stack[--stack_pos]; + return true; + } + return false; + }; + push_context(j_state, commands, false); + + parse_result last_result = parse_result::SUCCESS; + context ctx; + int element_count = 0; + while (pop_context(ctx)) { + path_operator op = *ctx.commands; + + switch (op.type) { + // whatever the first object is + case path_operator_type::ROOT: + PARSE_TRY(ctx.j_state.next_element()); + push_context(ctx.j_state, ctx.commands + 1); + break; + + // .name + // ['name'] + // [1] + // will return a single thing + case path_operator_type::CHILD: { + PARSE_TRY(ctx.j_state.child_element(op.expected_type)); + if (last_result == parse_result::SUCCESS) { + PARSE_TRY(ctx.j_state.next_matching_element(op.name, true)); + if (last_result == parse_result::SUCCESS) { + push_context(ctx.j_state, ctx.commands + 1, ctx.list_element); + } + } + } break; + + // .* + // [*] + // will return an array of things + case path_operator_type::CHILD_WILDCARD: { + // if we're on the first element of this wildcard + if (!ctx.state_flag) { + // we will only ever be returning 1 array + if (!ctx.list_element) { output.add_output({"[" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); } + + // step into the child element + PARSE_TRY(ctx.j_state.child_element(op.expected_type)); + if (last_result == parse_result::EMPTY) { + if (!ctx.list_element) { + output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); + } + last_result = parse_result::SUCCESS; + break; + } + + // first element + PARSE_TRY(ctx.j_state.next_matching_element({"*", 1}, true)); + if (last_result == parse_result::EMPTY) { + if (!ctx.list_element) { + output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); + } + last_result = parse_result::SUCCESS; + break; + } + + // re-push ourselves + push_context(ctx.j_state, ctx.commands, ctx.list_element, true); + // push the next command + push_context(ctx.j_state, ctx.commands + 1, true); + } else { + // next element + PARSE_TRY(ctx.j_state.next_matching_element({"*", 1}, false)); + if (last_result == parse_result::EMPTY) { + if (!ctx.list_element) { + output.add_output({"]" DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); + } + last_result = parse_result::SUCCESS; + break; + } + + // re-push ourselves + push_context(ctx.j_state, ctx.commands, ctx.list_element, true); + // push the next command + push_context(ctx.j_state, ctx.commands + 1, true); + } + } break; + + // [0] + // [1] + // etc + // returns a single thing + case path_operator_type::CHILD_INDEX: { + PARSE_TRY(ctx.j_state.child_element(op.expected_type)); + if (last_result == parse_result::SUCCESS) { + string_view const any{"*", 1}; + PARSE_TRY(ctx.j_state.next_matching_element(any, true)); + if (last_result == parse_result::SUCCESS) { + int idx; + for (idx = 1; idx <= op.index; idx++) { + PARSE_TRY(ctx.j_state.next_matching_element(any, false)); + if (last_result == parse_result::EMPTY) { break; } + } + // if we didn't end up at the index we requested, this is an invalid 
index
+              if (idx - 1 != op.index) { return parse_result::ERROR; }
+              push_context(ctx.j_state, ctx.commands + 1, ctx.list_element);
+            }
+          }
+        } break;
+
+        // some sort of error.
+        case path_operator_type::ERROR: return parse_result::ERROR; break;
+
+        // END case
+        default: {
+          if (ctx.list_element && element_count > 0) {
+            output.add_output({"," DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN});
+          }
+          PARSE_TRY(ctx.j_state.extract_element(&output, ctx.list_element));
+          if (ctx.list_element && last_result != parse_result::EMPTY) { element_count++; }
+        } break;
+      }
+    }
+
+  return parse_result::SUCCESS;
+}
+
+// hardcoding this for now. to reach a stack depth of 8 would require
+// a JSONPath containing 7 nested wildcards so this is probably reasonable.
+constexpr int max_command_stack_depth = 8;
+
+/**
+ * @brief Parse a single json string using the provided command buffer
+ *
+ * This function exists primarily as a shim for debugging purposes.
+ *
+ * @param input The incoming json string
+ * @param input_len Size of the incoming json string
+ * @param commands The command buffer to be applied to the string. Always ends with a
+ * path_operator_type::END
+ * @param out_buf Buffer used to store the results of the query (nullptr in the size computation
+ * step)
+ * @param out_buf_size Size of the output buffer
+ * @returns A pair containing the result code and the output buffer.
+ */
+__device__ thrust::pair<parse_result, json_output> get_json_object_single(
+  char const* input,
+  size_t input_len,
+  path_operator const* const commands,
+  char* out_buf,
+  size_t out_buf_size)
+{
+  json_state j_state(input, input_len);
+  json_output output{out_buf_size, out_buf};
+
+  auto const result = parse_json_path<max_command_stack_depth>(j_state, commands, output);
+
+  return {result, output};
+}
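The kernel that follows runs twice over the same code path: a sizing pass with no output buffers, then a fill pass once offsets are known. A minimal host-side sketch of that size/scan/fill idiom in plain C++, where `produce` is a hypothetical stand-in for the per-row JSONPath query and `std::exclusive_scan` plays the role of the `thrust::exclusive_scan` call below:

```cpp
#include <numeric>
#include <string>
#include <vector>

// Hypothetical per-row operation: returns the output for one input row.
std::string produce(std::string const& row) { return row + "!"; }

int main()
{
  std::vector<std::string> input{"a", "bc", "def"};
  std::vector<size_t> offsets(input.size() + 1, 0);

  // pass 1: compute per-row output sizes into offsets[0..n-1]
  for (size_t i = 0; i < input.size(); ++i) { offsets[i] = produce(input[i]).size(); }

  // exclusive scan converts sizes to offsets; offsets[n] becomes the total size
  std::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin(), size_t{0});

  // pass 2: allocate once, then write each row's result at its offset
  std::string chars(offsets.back(), '\0');
  for (size_t i = 0; i < input.size(); ++i) {
    auto const out = produce(input[i]);
    out.copy(&chars[offsets[i]], out.size());
  }
  return 0;
}
```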
+/**
+ * @brief Kernel for running the JSONPath query.
+ *
+ * This kernel operates in a 2-pass way. On the first pass, it computes
+ * output sizes. On the second pass it fills in the provided output buffers
+ * (chars and validity)
+ *
+ * @param col Device view of the incoming string
+ * @param commands JSONPath command buffer
+ * @param output_offsets Buffer used to store the string offsets for the results of the query
+ * @param out_buf Buffer used to store the results of the query
+ * @param out_validity Output validity buffer
+ * @param out_valid_count Output count of # of valid bits
+ */
+template <int block_size>
+__launch_bounds__(block_size) __global__
+  void get_json_object_kernel(column_device_view col,
+                              path_operator const* const commands,
+                              offset_type* output_offsets,
+                              thrust::optional<char*> out_buf,
+                              thrust::optional<bitmask_type*> out_validity,
+                              thrust::optional<size_type*> out_valid_count)
+{
+  size_type tid    = threadIdx.x + (blockDim.x * blockIdx.x);
+  size_type stride = blockDim.x * gridDim.x;
+
+  if (out_valid_count.has_value()) { *(out_valid_count.value()) = 0; }
+  size_type warp_valid_count{0};
+
+  auto active_threads = __ballot_sync(0xffffffff, tid < col.size());
+  while (tid < col.size()) {
+    bool is_valid         = false;
+    string_view const str = col.element<string_view>(tid);
+    size_type output_size = 0;
+    if (str.size_bytes() > 0) {
+      char* dst = out_buf.has_value() ? out_buf.value() + output_offsets[tid] : nullptr;
+      size_t const dst_size =
+        out_buf.has_value() ? output_offsets[tid + 1] - output_offsets[tid] : 0;
+
+      parse_result result;
+      json_output out;
+      thrust::tie(result, out) =
+        get_json_object_single(str.data(), str.size_bytes(), commands, dst, dst_size);
+      output_size = out.output_len.value_or(0);
+      if (out.output_len.has_value() && result == parse_result::SUCCESS) { is_valid = true; }
+    }
+
+    // filled in only during the precompute step. during the compute step, the offsets
+    // are fed back in so we do -not- want to write them out
+    if (!out_buf.has_value()) { output_offsets[tid] = static_cast<offset_type>(output_size); }
+
+    // validity filled in only during the output step
+    if (out_validity.has_value()) {
+      uint32_t mask = __ballot_sync(active_threads, is_valid);
+      // 0th lane of the warp writes the validity
+      if (!(tid % cudf::detail::warp_size)) {
+        out_validity.value()[cudf::word_index(tid)] = mask;
+        warp_valid_count += __popc(mask);
+      }
+    }
+
+    tid += stride;
+    active_threads = __ballot_sync(active_threads, tid < col.size());
+  }
+
+  // sum the valid counts across the whole block
+  if (out_valid_count) {
+    size_type block_valid_count =
+      cudf::detail::single_lane_block_sum_reduce<block_size>(warp_valid_count);
+    if (threadIdx.x == 0) { atomicAdd(out_valid_count.value(), block_valid_count); }
+  }
+}
+
+/**
+ * @copydoc cudf::strings::detail::get_json_object
+ */
+std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& col,
+                                              cudf::string_scalar const& json_path,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr)
+{
+  // preprocess the json_path into a command buffer
+  auto preprocess = build_command_buffer(json_path, stream);
+  CUDF_EXPECTS(std::get<1>(preprocess) <= max_command_stack_depth,
+               "Encountered JSONPath string that is too complex");
+
+  // allocate output offsets buffer.
+  auto offsets = cudf::make_fixed_width_column(
+    data_type{type_id::INT32}, col.size() + 1, mask_state::UNALLOCATED, stream, mr);
+  cudf::mutable_column_view offsets_view(*offsets);
+
+  // if the query is empty, return a string column containing all nulls
+  if (!std::get<0>(preprocess).has_value()) {
+    return std::make_unique<column>(
+      data_type{type_id::STRING},
+      col.size(),
+      rmm::device_buffer{0, stream, mr},  // no data
+      cudf::detail::create_null_mask(col.size(), mask_state::ALL_NULL, stream, mr),
+      col.size());  // null count
+  }
+
+  constexpr int block_size = 512;
+  cudf::detail::grid_1d const grid{col.size(), block_size};
+
+  auto cdv = column_device_view::create(col.parent(), stream);
+
+  // preprocess sizes (returned in the offsets buffer)
+  get_json_object_kernel<block_size>
+    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+      *cdv,
+      std::get<0>(preprocess).value().data(),
+      offsets_view.head<offset_type>(),
+      thrust::nullopt,
+      thrust::nullopt,
+      thrust::nullopt);
+
+  // convert sizes to offsets
+  thrust::exclusive_scan(rmm::exec_policy(stream),
+                         offsets_view.head<offset_type>(),
+                         offsets_view.head<offset_type>() + col.size() + 1,
+                         offsets_view.head<offset_type>(),
+                         0);
+  size_type const output_size =
+    cudf::detail::get_value<offset_type>(offsets_view, col.size(), stream);
+
+  // allocate output string column
+  auto chars = cudf::make_fixed_width_column(
+    data_type{type_id::INT8}, output_size, mask_state::UNALLOCATED, stream, mr);
+
+  // potential optimization : if we know that all outputs are valid, we could skip creating
+  // the validity mask altogether
+  rmm::device_buffer validity =
+    cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr);
+
+  // compute results
+  cudf::mutable_column_view chars_view(*chars);
+  rmm::device_scalar<size_type> d_valid_count{0, stream};
+  get_json_object_kernel<block_size>
+    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+      *cdv,
std::get<0>(preprocess).value().data(), + offsets_view.head(), + chars_view.head(), + static_cast(validity.data()), + d_valid_count.data()); + + return make_strings_column(col.size(), + std::move(offsets), + std::move(chars), + col.size() - d_valid_count.value(), + std::move(validity), + stream, + mr); +} + +} // namespace +} // namespace detail + +/** + * @copydoc cudf::strings::get_json_object + */ +std::unique_ptr get_json_object(cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::get_json_object(col, json_path, 0, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 082f039054e..f9904dda49e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -334,6 +334,7 @@ ConfigureTest(STRINGS_TEST strings/hash_string.cu strings/integers_tests.cu strings/ipv4_tests.cpp + strings/json_tests.cpp strings/pad_tests.cpp strings/replace_regex_tests.cpp strings/replace_tests.cpp diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp new file mode 100644 index 00000000000..44eb35d4163 --- /dev/null +++ b/cpp/tests/strings/json_tests.cpp @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +// reference: https://jsonpath.herokuapp.com/ + +// clang-format off +std::string json_string{ + "{" + "\"store\": {""\"book\": [" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. 
Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "\"bicycle\": {" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "}," + "\"expensive\": 10" + "}" +}; +// clang-format on + +std::unique_ptr drop_whitespace(cudf::column_view const& col) +{ + cudf::test::strings_column_wrapper whitespace{"\n", "\r", "\t"}; + cudf::test::strings_column_wrapper repl{"", "", ""}; + + cudf::strings_column_view strings(col); + cudf::strings_column_view targets(whitespace); + cudf::strings_column_view replacements(repl); + return cudf::strings::replace(strings, targets, replacements); +} + +struct JsonTests : public cudf::test::BaseFixture { +}; + +TEST_F(JsonTests, GetJsonObjectRootOp) +{ + // root + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + auto expected = drop_whitespace(input); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); +} + +TEST_F(JsonTests, GetJsonObjectChildOp) +{ + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "{" + "\"book\": [" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "\"bicycle\": {" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "}" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. 
Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectWildcardOp) +{ + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.*"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "[" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "{" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("*"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "{" + "\"book\": [" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. 
Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]," + "\"bicycle\": {" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + "}," + "10" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectSubscriptOp) +{ + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[2]"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store['bicycle']"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "{" + "\"color\": \"red\"," + "\"price\": 19.95" + "}" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*]"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + // clang-format off + cudf::test::strings_column_wrapper expected_raw{ + "[" + "{" + "\"category\": \"reference\"," + "\"author\": \"Nigel Rees\"," + "\"title\": \"Sayings of the Century\"," + "\"price\": 8.95" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Evelyn Waugh\"," + "\"title\": \"Sword of Honour\"," + "\"price\": 12.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"Herman Melville\"," + "\"title\": \"Moby Dick\"," + "\"isbn\": \"0-553-21311-3\"," + "\"price\": 8.99" + "}," + "{" + "\"category\": \"fiction\"," + "\"author\": \"J. R. R. 
Tolkien\"," + "\"title\": \"The Lord of the Rings\"," + "\"isbn\": \"0-395-19395-8\"," + "\"price\": 22.99" + "}" + "]" + }; + // clang-format on + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectFilter) +{ + // queries that result in filtering/collating results (mostly meaning - generates new + // json instead of just returning parts of the existing string + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*]['isbn']"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{"[\"0-553-21311-3\",\"0-395-19395-8\"]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*].category"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{ + "[\"reference\",\"fiction\",\"fiction\",\"fiction\"]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[*].title"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{ + "[\"Sayings of the Century\",\"Sword of Honour\",\"Moby Dick\",\"The Lord of the Rings\"]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book.*.price"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{"[8.95,12.99,8.99,22.99]"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } + + { + // spark behavioral difference. 
+ // standard: "fiction" + // spark: fiction + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[2].category"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw{"fiction"}; + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectNullInputs) +{ + { + std::string str("{\"a\" : \"b\"}"); + cudf::test::strings_column_wrapper input({str, str, str, str}, {1, 0, 1, 0}); + + std::string json_path("$.a"); + auto result_raw = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + auto result = drop_whitespace(*result_raw); + + cudf::test::strings_column_wrapper expected_raw({"b", "", "b", ""}, {1, 0, 1, 0}); + auto expected = drop_whitespace(expected_raw); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, *expected); + } +} + +TEST_F(JsonTests, GetJsonObjectEmptyQuery) +{ + // empty query -> null + { + cudf::test::strings_column_wrapper input{"{\"a\" : \"b\"}"}; + std::string json_path(""); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} + +TEST_F(JsonTests, GetJsonObjectEmptyInputsAndOutputs) +{ + // empty input -> null + { + cudf::test::strings_column_wrapper input{""}; + std::string json_path("$"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // slightly different from "empty output". in this case, we're + // returning something, but it happens to be empty. 
so we expect + // a valid, but empty row + { + cudf::test::strings_column_wrapper input{"{\"store\": { \"bicycle\" : \"\" } }"}; + std::string json_path("$.store.bicycle"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} + +// badly formed JSONpath strings +TEST_F(JsonTests, GetJsonObjectIllegalQuery) +{ + // can't have more than one root operator, or a root operator anywhere other + // than the beginning + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$$"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // invalid index + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[auh46h-]"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // invalid index + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[[]]"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // negative index + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[-1]"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + // child operator with no name specified + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("."); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("]["); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } + + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("6hw6,56i3"); + auto query = [&]() { + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + }; + EXPECT_THROW(query(), cudf::logic_error); + } +} + +// queries that are legal, but reference invalid parts of the input +TEST_F(JsonTests, GetJsonObjectInvalidQuery) +{ + // non-existent field + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[*].c"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // non-existent field + { + cudf::test::strings_column_wrapper input{"{\"a\": \"b\"}"}; + std::string json_path("$[*].c[2]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // non-existent field + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book.price"); + auto result = 
cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + // out of bounds index + { + cudf::test::strings_column_wrapper input{json_string}; + std::string json_path("$.store.book[4]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + cudf::test::strings_column_wrapper expected({""}, {0}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} + +TEST_F(JsonTests, MixedOutput) +{ + // various queries on: + // clang-format off + std::vector input_strings { + "{\"a\": {\"b\" : \"c\"}}", + + "{" + "\"a\": {\"b\" : \"c\"}," + "\"d\": [{\"e\":123}, {\"f\":-10}]" + "}", + + "{" + "\"b\": 123" + "}", + + "{" + "\"a\": [\"y\",500]" + "}", + + "{" + "\"a\": \"\"" + "}", + + "{" + "\"a\": {" + "\"z\": {\"i\": 10, \"j\": 100}," + "\"b\": [\"c\",null,true,-1]" + "}" + "}" + }; + // clang-format on + cudf::test::strings_column_wrapper input(input_strings.begin(), input_strings.end()); + { + std::string json_path("$.a"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "{\"b\" : \"c\"}", + "{\"b\" : \"c\"}", + "", + "[\"y\",500]", + "", + "{" + "\"z\": {\"i\": 10, \"j\": 100}," + "\"b\": [\"c\",null,true,-1]" + "}" + }, + {1, 1, 0, 1, 1, 1}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { + std::string json_path("$.a[1]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "", + "", + "", + "500", + "", + "", + }, + {0, 0, 0, 1, 0, 0}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { + std::string json_path("$.a.b"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "c", + "c", + "", + "", + "", + "[\"c\",null,true,-1]"}, + {1, 1, 0, 0, 0, 1}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { + std::string json_path("$.a[*]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "[\"c\"]", + "[\"c\"]", + "", + "[\"y\",500]", + "[]", + "[" + "{\"i\": 10, \"j\": 100}," + "[\"c\",null,true,-1]" + "]" }, + {1, 1, 0, 1, 1, 1}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } + + { + std::string json_path("$.a.b[*]"); + auto result = cudf::strings::get_json_object(cudf::strings_column_view(input), json_path); + + // clang-format off + cudf::test::strings_column_wrapper expected({ + "[]", + "[]", + "", + "", + "", + "[\"c\",null,true,-1]"}, + {1, 1, 0, 0, 0, 1}); + // clang-format on + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); + } +} diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index 78a67464654..a54c86405a5 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -71,7 +71,7 @@ struct column_property_comparator { // equivalent, but not exactly equal columns can have a different number of children if their // sizes are both 0. 
Specifically, empty string columns may or may not have children. - if (check_exact_equality || lhs.size() > 0) { + if (check_exact_equality || (lhs.size() > 0 && lhs.null_count() < lhs.size())) { EXPECT_EQ(lhs.num_children(), rhs.num_children()); } } diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 5d869ab75fb..402c64dd83d 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -2083,6 +2083,23 @@ public final ColumnVector substring(ColumnView start, ColumnView end) { return new ColumnVector(substringColumn(getNativeView(), start.getNativeView(), end.getNativeView())); } + /** + * Apply a JSONPath string to all rows in an input strings column. + * + * Applies a JSONPath string to an incoming strings column where each row in the column + * is a valid json string. The output is returned by row as a strings column. + * + * For reference, https://tools.ietf.org/id/draft-goessner-dispatch-jsonpath-00.html + * Note: Only implements the operators: $ . [] * + * + * @param path The JSONPath string to be applied to each row + * @return new strings ColumnVector containing the retrieved json object strings + */ + public final ColumnVector getJSONObject(Scalar path) { + assert(type.equals(DType.STRING)) : "column type must be a String"; + return new ColumnVector(getJSONObject(getNativeView(), path.getScalarHandle())); + } + /** * Returns a new strings column where target string within each string is replaced with the specified * replacement string. @@ -2649,6 +2666,8 @@ static DeviceMemoryBufferView getOffsetsBuffer(long viewHandle) { */ private static native long stringTimestampToTimestamp(long viewHandle, int unit, String format); + private static native long getJSONObject(long viewHandle, long scalarHandle) throws CudfException; + /** * Native method to parse and convert a timestamp column vector to string column vector. 
A unix * timestamp is a long value representing how many units since 1970-01-01 00:00:00:000 in either diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index dc1acc50b5f..cec3a1a92a6 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -65,6 +66,8 @@ #include "cudf_jni_apis.hpp" #include "dtype_utils.hpp" +#include "jni.h" +#include "jni_utils.hpp" namespace { @@ -1835,4 +1838,24 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv } CATCH_STD(env, 0) } + +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv *env, jclass, + jlong j_view_handle, jlong j_scalar_handle) { + + JNI_NULL_CHECK(env, j_view_handle, "view cannot be null", 0); + JNI_NULL_CHECK(env, j_scalar_handle, "path cannot be null", 0); + + try { + cudf::jni::auto_set_device(env); + cudf::column_view* n_column_view = reinterpret_cast(j_view_handle); + cudf::strings_column_view n_strings_col_view(*n_column_view); + cudf::string_scalar *n_scalar_path = reinterpret_cast(j_scalar_handle); + + auto result = cudf::strings::get_json_object(n_strings_col_view, *n_scalar_path); + + return reinterpret_cast(result.release()); + } + CATCH_STD(env, 0) + +} } // extern "C" diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index fe1cba5ceb1..ce2c287a1c8 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4132,6 +4132,50 @@ void testCopyToColumnVector() { } } + @Test + void testGetJSONObject() { + String jsonString = "{ \"store\": {\n" + + " \"book\": [\n" + + " { \"category\": \"reference\",\n" + + " \"author\": \"Nigel Rees\",\n" + + " \"title\": \"Sayings of the Century\",\n" + + " \"price\": 8.95\n" + + " },\n" + + " { \"category\": \"fiction\",\n" + + " \"author\": \"Evelyn Waugh\",\n" + + " \"title\": \"Sword of Honour\",\n" + + " \"price\": 12.99\n" + + " },\n" + + " { \"category\": \"fiction\",\n" + + " \"author\": \"Herman Melville\",\n" + + " \"title\": \"Moby Dick\",\n" + + " \"isbn\": \"0-553-21311-3\",\n" + + " \"price\": 8.99\n" + + " },\n" + + " { \"category\": \"fiction\",\n" + + " \"author\": \"J. R. R. Tolkien\",\n" + + " \"title\": \"The Lord of the Rings\",\n" + + " \"isbn\": \"0-395-19395-8\",\n" + + " \"price\": 22.99\n" + + " }\n" + + " ],\n" + + " \"bicycle\": {\n" + + " \"color\": \"red\",\n" + + " \"price\": 19.95\n" + + " }\n" + + " }\n" + + "}"; + + try (ColumnVector json = ColumnVector.fromStrings(jsonString, jsonString); + ColumnVector expectedAuthors = ColumnVector.fromStrings("[\"Nigel Rees\",\"Evelyn " + + "Waugh\",\"Herman Melville\",\"J. R. R. Tolkien\"]", "[\"Nigel Rees\",\"Evelyn " + + "Waugh\",\"Herman Melville\",\"J. R. R. Tolkien\"]"); + Scalar path = Scalar.fromString("$.store.book[*].author"); + ColumnVector gotAuthors = json.getJSONObject(path)) { + assertColumnsAreEqual(expectedAuthors, gotAuthors); + } + } + @Test void testMakeStructEmpty() { final int numRows = 10; From c05dbed52fdd15757e40463a64ce757d6cd21b46 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 31 Mar 2021 12:32:09 -0500 Subject: [PATCH 06/59] Add column names validation in parquet writer (#7786) Fixes: #7738 Parquet writer requires all column names to be of string types, added a validation similar to that of pandas. 
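
At the C++ layer the constraint is structural: libcudf's writer metadata carries column names as `std::string`, so the Python layer must validate before crossing that boundary. A minimal sketch (the table, its column count, and the file name are illustrative; `table_input_metadata` is the type the Cython code above populates):

```cpp
#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

// A rough sketch; `tbl` is assumed to be an existing two-column table_view.
void write_example(cudf::table_view const& tbl)
{
  cudf::io::table_input_metadata metadata(tbl);
  // set_name() only accepts std::string, so a non-string Python column name
  // has to be rejected (or encoded) before it reaches this layer.
  metadata.column_metadata[0].set_name("1");
  metadata.column_metadata[1].set_name("prices");

  auto opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{"example.parquet"}, tbl)
      .metadata(&metadata)
      .build();
  cudf::io::write_parquet(opts);
}
```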
Authors: - GALI PREM SAGAR (@galipremsagar) Approvers: - Michael Wang (@isVoid) - Keith Kraus (@kkraus14) URL: https://github.com/rapidsai/cudf/pull/7786 --- python/cudf/cudf/_lib/parquet.pyx | 3 +++ python/cudf/cudf/tests/test_parquet.py | 14 +++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d8b4fbbbe4b..4ea2adec23a 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -312,6 +312,9 @@ cpdef write_parquet( num_index_cols_meta = 0 for i, name in enumerate(table._column_names, num_index_cols_meta): + if not isinstance(name, str): + raise ValueError("parquet must have string column names") + tbl_meta.get().column_metadata[i].set_name(name.encode()) _set_col_metadata( table[name]._column, tbl_meta.get().column_metadata[i] diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index fe418d1ade1..4781ff995b0 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -19,7 +19,7 @@ import cudf from cudf.io.parquet import ParquetWriter, merge_parquet_filemetadata from cudf.tests import dataset_generator as dg -from cudf.tests.utils import assert_eq +from cudf.tests.utils import assert_eq, assert_exceptions_equal @pytest.fixture(scope="module") @@ -1937,3 +1937,15 @@ def test_parquet_writer_decimal(tmpdir): got = pd.read_parquet(fname) assert_eq(gdf, got) + + +def test_parquet_writer_column_validation(): + df = cudf.DataFrame({1: [1, 2, 3], "1": ["a", "b", "c"]}) + pdf = df.to_pandas() + + assert_exceptions_equal( + lfunc=df.to_parquet, + rfunc=pdf.to_parquet, + lfunc_args_and_kwargs=(["cudf.parquet"],), + rfunc_args_and_kwargs=(["pandas.parquet"],), + ) From acb69858808ff50ec2b57bde6fc5b4920732e31a Mon Sep 17 00:00:00 2001 From: Thomas Graves Date: Wed, 31 Mar 2021 13:39:31 -0500 Subject: [PATCH 07/59] Turn on NVTX by default in java build (#7761) Investigation was done under https://github.com/NVIDIA/spark-rapids/issues/1721 and it showed no significant performance difference with NVTX on. It would make it a lot easier if this was on by default because it allows customers and developers to get trace with the same jar without having to go off and build a new CUDF version. So this PR turns it on by default and adds in reading from environment variable if we need to change in the future from build scripts. Authors: - Thomas Graves (@tgravescs) Approvers: - Jason Lowe (@jlowe) - Robert (Bobby) Evans (@revans2) URL: https://github.com/rapidsai/cudf/pull/7761 --- java/ci/build-in-docker.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index eee943cde38..b2d0b066ce7 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -24,6 +24,7 @@ SKIP_JAVA_TESTS=${SKIP_JAVA_TESTS:-true} BUILD_CPP_TESTS=${BUILD_CPP_TESTS:-OFF} ENABLE_PTDS=${ENABLE_PTDS:-ON} RMM_LOGGING_LEVEL=${RMM_LOGGING_LEVEL:-OFF} +ENABLE_NVTX=${ENABLE_NVTX:-ON} OUT=${OUT:-out} SIGN_FILE=$1 @@ -35,6 +36,7 @@ echo "SIGN_FILE: $SIGN_FILE,\ SKIP_JAVA_TESTS: $SKIP_JAVA_TESTS,\ BUILD_CPP_TESTS: $BUILD_CPP_TESTS,\ ENABLED_PTDS: $ENABLE_PTDS,\ + ENABLE_NVTX: $ENABLE_NVTX,\ RMM_LOGGING_LEVEL: $RMM_LOGGING_LEVEL,\ OUT_PATH: $OUT_PATH" @@ -51,7 +53,7 @@ export PATH=/usr/local/cmake-3.19.0-Linux-x86_64/bin:$PATH rm -rf $WORKSPACE/cpp/build mkdir -p $WORKSPACE/cpp/build cd $WORKSPACE/cpp/build -cmake .. 
-DUSE_NVTX=OFF -DCUDF_USE_ARROW_STATIC=ON -DBoost_USE_STATIC_LIBS=ON -DBUILD_TESTS=$SKIP_CPP_TESTS -DPER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL +cmake .. -DUSE_NVTX=$ENABLE_NVTX -DCUDF_USE_ARROW_STATIC=ON -DBoost_USE_STATIC_LIBS=ON -DBUILD_TESTS=$SKIP_CPP_TESTS -DPER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS -DRMM_LOGGING_LEVEL=$RMM_LOGGING_LEVEL make -j$PARALLEL_LEVEL make install DESTDIR=$INSTALL_PREFIX From d804a47818313bb1277c738c3ab5fe8d4778f631 Mon Sep 17 00:00:00 2001 From: pxLi Date: Thu, 1 Apr 2021 02:58:07 +0800 Subject: [PATCH 08/59] update Java bindings version to 0.20 (#7747) Updating the Java bindings package version to match the libcudf version for 0.20 also as [Deprecation announcement for CUDA 10.1 & 10.2 in v0.19](https://docs.rapids.ai/notices/rsn0005/), to update README to drop cuda 10.1 and cuda 10.2. Please let me know if we are going to support other cuda version 11.x except 11.0 Authors: - pxLi (@pxLi) Approvers: - Jason Lowe (@jlowe) - Robert (Bobby) Evans (@revans2) URL: https://github.com/rapidsai/cudf/pull/7747 --- java/README.md | 4 ++-- java/ci/README.md | 14 ++++++-------- java/pom.xml | 2 +- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/java/README.md b/java/README.md index 6ca58496605..366d014db95 100644 --- a/java/README.md +++ b/java/README.md @@ -38,12 +38,12 @@ In some cases there may be a classifier to indicate the version of cuda required Build From Source section below for more information about when this can happen. No official release of the jar will have a classifier on it. -CUDA 10.0: +CUDA 11.0: ```xml ai.rapids cudf - cuda10 + cuda11 ${cudf.version} ``` diff --git a/java/ci/README.md b/java/ci/README.md index 3ffed71b27c..8f45c0f89af 100644 --- a/java/ci/README.md +++ b/java/ci/README.md @@ -11,16 +11,14 @@ In the root path of cuDF repo, run below command to build the docker image. ```bash -docker build -f java/ci/Dockerfile.centos7 --build-arg CUDA_VERSION=10.1 -t cudf-build:10.1-devel-centos7 . +docker build -f java/ci/Dockerfile.centos7 --build-arg CUDA_VERSION=11.0 -t cudf-build:11.0-devel-centos7 . ``` -We support different CUDA versions as below: -* CUDA 10.1 -* CUDA 10.2 +The following CUDA versions are supported: * CUDA 11.0 Change the --build-arg CUDA_VERSION to what you need. -You can replace the tag "cudf-build:10.1-devel-centos7" with another name you like. +You can replace the tag "cudf-build:11.0-devel-centos7" with another name you like. ## Start the docker then build @@ -28,7 +26,7 @@ You can replace the tag "cudf-build:10.1-devel-centos7" with another name you li Run below command to start a docker container with GPU. ```bash -nvidia-docker run -it cudf-build:10.1-devel-centos7 bash +nvidia-docker run -it cudf-build:11.0-devel-centos7 bash ``` ### Download the cuDF source code @@ -36,7 +34,7 @@ nvidia-docker run -it cudf-build:10.1-devel-centos7 bash You can download the cuDF repo in the docker container or you can mount it into the container. Here I choose to download again in the container. ```bash -git clone --recursive https://github.com/rapidsai/cudf.git -b branch-0.19 +git clone --recursive https://github.com/rapidsai/cudf.git -b branch-0.20 ``` ### Build cuDF jar with devtoolset @@ -49,5 +47,5 @@ scl enable devtoolset-8 "java/ci/build-in-docker.sh" ### The output -You can find the cuDF jar in java/target/ like cudf-0.19-SNAPSHOT-cuda10-1.jar. +You can find the cuDF jar in java/target/ like cudf-0.20-SNAPSHOT-cuda11.jar. 
diff --git a/java/pom.xml b/java/pom.xml index a3fd464b320..d94d51944a0 100755 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,7 +21,7 @@ ai.rapids cudf - 0.19-SNAPSHOT + 0.20-SNAPSHOT cudfjni From 4d6ea76be7b98926b09b93d5f4a309258c8843e4 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Wed, 31 Mar 2021 13:36:33 -0700 Subject: [PATCH 09/59] add copy methods in Java memory buffer (#7791) This should simplify the code in the rapids shuffle manager (see https://github.com/NVIDIA/spark-rapids/pull/2050). @jlowe @abellina @revans2 Authors: - Rong Ou (https://github.com/rongou) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/7791 --- .../java/ai/rapids/cudf/MemoryBuffer.java | 33 ++++ .../java/ai/rapids/cudf/MemoryBufferTest.java | 171 ++++++++++++++++++ 2 files changed, 204 insertions(+) create mode 100644 java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java diff --git a/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java b/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java index a1be9b561a0..9f0d9a451c0 100644 --- a/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java +++ b/java/src/main/java/ai/rapids/cudf/MemoryBuffer.java @@ -146,6 +146,39 @@ public final long getAddress() { return address; } + /** + * Copy a subset of src to this buffer starting at destOffset using the specified CUDA stream. + * The copy has completed when this returns, but the memory copy could overlap with + * operations occurring on other streams. + * @param destOffset the offset in this to start copying from. + * @param src what to copy from + * @param srcOffset offset into src to start out + * @param length how many bytes to copy + * @param stream CUDA stream to use + */ + public final void copyFromMemoryBuffer( + long destOffset, MemoryBuffer src, long srcOffset, long length, Cuda.Stream stream) { + addressOutOfBoundsCheck(address + destOffset, length, "copy range dest"); + src.addressOutOfBoundsCheck(src.address + srcOffset, length, "copy range src"); + Cuda.memcpy(address + destOffset, src.address + srcOffset, length, CudaMemcpyKind.DEFAULT, stream); + } + + /** + * Copy a subset of src to this buffer starting at destOffset using the specified CUDA stream. + * The copy is async and may not have completed when this returns. + * @param destOffset the offset in this to start copying from. + * @param src what to copy from + * @param srcOffset offset into src to start out + * @param length how many bytes to copy + * @param stream CUDA stream to use + */ + public final void copyFromMemoryBufferAsync( + long destOffset, MemoryBuffer src, long srcOffset, long length, Cuda.Stream stream) { + addressOutOfBoundsCheck(address + destOffset, length, "copy range dest"); + src.addressOutOfBoundsCheck(src.address + srcOffset, length, "copy range src"); + Cuda.asyncMemcpy(address + destOffset, src.address + srcOffset, length, CudaMemcpyKind.DEFAULT, stream); + } + /** * Slice off a part of the buffer. Note that this is a zero copy operation and all * slices must be closed along with the original buffer before the memory is released. diff --git a/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java b/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java new file mode 100644 index 00000000000..df710c71f63 --- /dev/null +++ b/java/src/test/java/ai/rapids/cudf/MemoryBufferTest.java @@ -0,0 +1,171 @@ +/* + * + * Copyright (c) 2021, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +package ai.rapids.cudf; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +public class MemoryBufferTest extends CudfTestBase { + private static final byte[] BYTES = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + private static final byte[] EXPECTED = {0, 2, 3, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + + @Test + public void testAddressOutOfBoundsExceptionWhenCopying() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(-1, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(16, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, -1, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, 16, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, 0, -1, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, 0, 17, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(1, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBuffer(0, from, 1, 16, Cuda.DEFAULT_STREAM)); + } + } + + @Test + public void testAddressOutOfBoundsExceptionWhenCopyingAsync() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(-1, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(16, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, -1, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, 16, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, 0, -1, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, 0, 17, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(1, from, 0, 16, Cuda.DEFAULT_STREAM)); + assertThrows(AssertionError.class, () -> to.copyFromMemoryBufferAsync(0, from, 1, 16, Cuda.DEFAULT_STREAM)); + } + } + + @Test + public void testCopyingFromDeviceToDevice() { + try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); + DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + in.setBytes(0, BYTES, 0, 16); + from.copyFromHostBuffer(in); + to.copyFromMemoryBuffer(0, from, 0, 16, Cuda.DEFAULT_STREAM); + to.copyFromMemoryBuffer(1, from, 2, 3, 
Cuda.DEFAULT_STREAM); + out.copyFromDeviceBuffer(to); + verifyOutput(out); + } + } + + @Test + public void testCopyingFromDeviceToDeviceAsync() { + try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); + DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + in.setBytes(0, BYTES, 0, 16); + from.copyFromHostBuffer(in); + to.copyFromMemoryBufferAsync(0, from, 0, 16, Cuda.DEFAULT_STREAM); + to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); + out.copyFromDeviceBufferAsync(to, Cuda.DEFAULT_STREAM); + Cuda.DEFAULT_STREAM.sync(); + verifyOutput(out); + } + } + + @Test + public void testCopyingFromHostToHost() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + from.setBytes(0, BYTES, 0, 16); + to.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBuffer(1, from, 2, 3, Cuda.DEFAULT_STREAM); + verifyOutput(to); + } + } + + @Test + public void testCopyingFromHostToHostAsync() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + from.setBytes(0, BYTES, 0, 16); + to.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); + verifyOutput(to); + } + } + + @Test + public void testCopyingFromHostToDevice() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + from.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBuffer(0, from, 0, 16, Cuda.DEFAULT_STREAM); + to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); + out.copyFromDeviceBuffer(to); + verifyOutput(out); + } + } + + @Test + public void testCopyingFromHostToDeviceAsync() { + try (HostMemoryBuffer from = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer to = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer out = HostMemoryBuffer.allocate(16)) { + from.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBufferAsync(0, from, 0, 16, Cuda.DEFAULT_STREAM); + to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); + out.copyFromDeviceBufferAsync(to, Cuda.DEFAULT_STREAM); + Cuda.DEFAULT_STREAM.sync(); + verifyOutput(out); + } + } + + @Test + public void testCopyingFromDeviceToHost() { + try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + in.setBytes(0, BYTES, 0, 16); + from.copyFromHostBuffer(in); + to.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBuffer(1, from, 2, 3, Cuda.DEFAULT_STREAM); + verifyOutput(to); + } + } + + @Test + public void testCopyingFromDeviceToHostAsync() { + try (HostMemoryBuffer in = HostMemoryBuffer.allocate(16); + DeviceMemoryBuffer from = DeviceMemoryBuffer.allocate(16); + HostMemoryBuffer to = HostMemoryBuffer.allocate(16)) { + in.setBytes(0, BYTES, 0, 16); + from.copyFromHostBuffer(in); + to.setBytes(0, BYTES, 0, 16); + to.copyFromMemoryBufferAsync(1, from, 2, 3, Cuda.DEFAULT_STREAM); + Cuda.DEFAULT_STREAM.sync(); + verifyOutput(to); + } + } + + private void verifyOutput(HostMemoryBuffer out) { + byte[] bytes = new byte[16]; + out.getBytes(bytes, 0, 0, 16); + assertArrayEquals(EXPECTED, bytes); + } +} From 9970f1df40c848615e30b06a2b3d95bd413f8532 Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Wed, 31 Mar 2021 16:23:10 -0500 Subject: [PATCH 10/59] 
Struct hashing support for SerialMurmur3 and SparkMurmur3 (#7714)

Adds struct column support for serial Murmur3 and Spark-compatible Murmur3
hashing. Struct columns are exploded into their leaf columns before being
passed to the existing hash support. The validity of the parent struct
columns can be ignored because hashing a null is a no-op that returns the
hash seed, so only the leaf columns within a struct column need to be
considered for the hash computation.

Authors:
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Conor Hoekstra (https://github.com/codereport)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/7714
---
 .../cudf/detail/utilities/hash_functions.cuh  |  32 +++++
 cpp/src/hash/hashing.cu                       |  25 +++-
 cpp/tests/hashing/hash_test.cpp               | 125 ++++++++++++------
 .../java/ai/rapids/cudf/ColumnVector.java     |   5 +-
 .../java/ai/rapids/cudf/ColumnVectorTest.java |  48 ++++++-
 5 files changed, 186 insertions(+), 49 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh
index 31533a69487..e79107e32cf 100644
--- a/cpp/include/cudf/detail/utilities/hash_functions.cuh
+++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh
@@ -542,6 +542,22 @@ hash_value_type CUDA_DEVICE_CALLABLE MurmurHash3_32<double>::operator()(double c
   return this->compute_floating_point(key);
 }

+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+MurmurHash3_32<cudf::list_view>::operator()(cudf::list_view const& key) const
+{
+  cudf_assert(false && "List column hashing is not supported");
+  return 0;
+}
+
+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+MurmurHash3_32<cudf::struct_view>::operator()(cudf::struct_view const& key) const
+{
+  cudf_assert(false && "Direct hashing of struct_view is not supported");
+  return 0;
+}
+
 template <typename Key>
 struct SparkMurmurHash3_32 {
   using argument_type = Key;
@@ -671,6 +687,22 @@ SparkMurmurHash3_32<numeric::decimal64>::operator()(numeric::decimal64 const& ke
   return this->compute(key.value());
 }

+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+SparkMurmurHash3_32<cudf::list_view>::operator()(cudf::list_view const& key) const
+{
+  cudf_assert(false && "List column hashing is not supported");
+  return 0;
+}
+
+template <>
+hash_value_type CUDA_DEVICE_CALLABLE
+SparkMurmurHash3_32<cudf::struct_view>::operator()(cudf::struct_view const& key) const
+{
+  cudf_assert(false && "Direct hashing of struct_view is not supported");
+  return 0;
+}
+
 /**
  * @brief Specialization of MurmurHash3_32 operator for strings.
  */
diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu
index 16efb666b3e..53be019f73b 100644
--- a/cpp/src/hash/hashing.cu
+++ b/cpp/src/hash/hashing.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -29,6 +29,8 @@
 #include

+#include
+
 namespace cudf {
 namespace {
@@ -38,6 +40,22 @@ bool md5_type_check(data_type dt)
 {
   return !is_chrono(dt) && (is_fixed_width(dt) || (dt.id() == type_id::STRING));
 }

+template <typename IterType>
+std::vector<column_view> to_leaf_columns(IterType iter_begin, IterType iter_end)
+{
+  std::vector<column_view> leaf_columns;
+  std::for_each(iter_begin, iter_end, [&leaf_columns](column_view const& col) {
+    if (is_nested(col.type())) {
+      CUDF_EXPECTS(col.type().id() == type_id::STRUCT, "unsupported nested type");
+      auto child_columns = to_leaf_columns(col.child_begin(), col.child_end());
+      leaf_columns.insert(leaf_columns.end(), child_columns.begin(), child_columns.end());
+    } else {
+      leaf_columns.emplace_back(col);
+    }
+  });
+  return leaf_columns;
+}
+
 }  // namespace

 namespace detail {

@@ -133,10 +151,11 @@ std::unique_ptr<column> serial_murmur_hash3_32(table_view const& input,
   if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }

-  auto const device_input = table_device_view::create(input, stream);
+  table_view const leaf_table(to_leaf_columns(input.begin(), input.end()));
+  auto const device_input = table_device_view::create(leaf_table, stream);
   auto output_view = output->mutable_view();

-  if (has_nulls(input)) {
+  if (has_nulls(leaf_table)) {
     thrust::tabulate(rmm::exec_policy(stream),
                      output_view.begin(),
                      output_view.end(),
diff --git a/cpp/tests/hashing/hash_test.cpp b/cpp/tests/hashing/hash_test.cpp
index 5641d445ff3..d928a17b3d1 100644
--- a/cpp/tests/hashing/hash_test.cpp
+++ b/cpp/tests/hashing/hash_test.cpp
@@ -257,20 +257,35 @@ TEST_F(SerialMurmurHash3Test, MultiValueWithSeeds)
   fixed_width_column_wrapper const bools_col1({0, 1, 1, 1, 0});
   fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0});

-  auto const input1 = cudf::table_view({strings_col});
-  auto const input2 = cudf::table_view({ints_col});
-  auto const input3 = cudf::table_view({strings_col, ints_col, bools_col1});
-  auto const input4 = cudf::table_view({strings_col, ints_col, bools_col2});
-
-  auto const hashed_output1 = cudf::hash(input1, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 314);
-  auto const hashed_output2 = cudf::hash(input2, cudf::hash_id::HASH_SERIAL_MURMUR3, {}, 42);
-  auto const hashed_output3 = cudf::hash(input3, cudf::hash_id::HASH_SERIAL_MURMUR3, {});
-  auto const hashed_output4 = cudf::hash(input4, cudf::hash_id::HASH_SERIAL_MURMUR3, {});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output1->view(), strings_col_result, true);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output2->view(), ints_col_result, true);
-  EXPECT_EQ(input3.num_rows(), hashed_output3->size());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed_output3->view(), hashed_output4->view(), true);
+  std::vector<std::unique_ptr<cudf::column>> struct_field_cols;
+  struct_field_cols.emplace_back(std::make_unique<cudf::column>(strings_col));
+  struct_field_cols.emplace_back(std::make_unique<cudf::column>(ints_col));
+  struct_field_cols.emplace_back(std::make_unique<cudf::column>(bools_col1));
+  structs_column_wrapper structs_col(std::move(struct_field_cols));
+
+  auto const combo1 = cudf::table_view({strings_col, ints_col, bools_col1});
+  auto const combo2 = cudf::table_view({strings_col, ints_col, bools_col2});
+
+  constexpr auto hasher   = cudf::hash_id::HASH_SERIAL_MURMUR3;
+  auto const strings_hash = cudf::hash(cudf::table_view({strings_col}), hasher, {}, 314);
+  auto const ints_hash    = cudf::hash(cudf::table_view({ints_col}), hasher, {}, 42);
+  auto const combo1_hash  = cudf::hash(combo1, hasher, {});
+  auto const combo2_hash  = cudf::hash(combo2, hasher, {});
+  auto const structs_hash =
cudf::hash(cudf::table_view({structs_col}), hasher, {}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*strings_hash, strings_col_result, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*ints_hash, ints_col_result, true); + EXPECT_EQ(combo1.num_rows(), combo1_hash->size()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*combo1_hash, *combo2_hash, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*structs_hash, *combo1_hash, true); +} + +TEST_F(SerialMurmurHash3Test, ListThrows) +{ + lists_column_wrapper strings_list_col({{""}, {"abc"}, {"123"}}); + EXPECT_THROW( + cudf::hash(cudf::table_view({strings_list_col}), cudf::hash_id::HASH_SERIAL_MURMUR3, {}), + cudf::logic_error); } class SparkMurmurHash3Test : public cudf::test::BaseFixture { @@ -280,31 +295,38 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) { // The hash values were determined by running the following Scala code in Apache Spark: // import org.apache.spark.sql.catalyst.util.DateTimeUtils - // val schema = new StructType().add("strings",StringType).add("doubles",DoubleType) - // .add("timestamps",TimestampType).add("decimal64", DecimalType(18,7)).add("longs",LongType) - // .add("floats",FloatType).add("dates",DateType).add("decimal32", DecimalType(9,3)) - // .add("ints",IntegerType).add("shorts",ShortType).add("bytes",ByteType) - // .add("bools",BooleanType) + // val schema = new StructType().add("structs", new StructType().add("a",IntegerType) + // .add("b",StringType).add("c",new StructType().add("x",FloatType).add("y",LongType))) + // .add("strings",StringType).add("doubles",DoubleType).add("timestamps",TimestampType) + // .add("decimal64", DecimalType(18,7)).add("longs",LongType).add("floats",FloatType) + // .add("dates",DateType).add("decimal32", DecimalType(9,3)).add("ints",IntegerType) + // .add("shorts",ShortType).add("bytes",ByteType).add("bools",BooleanType) // val data = Seq( - // Row("", 0.toDouble, DateTimeUtils.toJavaTimestamp(0), BigDecimal(0), 0.toLong, 0.toFloat, - // DateTimeUtils.toJavaDate(0), BigDecimal(0), 0, 0.toShort, 0.toByte, false), - // Row("The quick brown fox", -(0.toDouble), DateTimeUtils.toJavaTimestamp(100), - // BigDecimal("0.00001"), 100.toLong, -(0.toFloat), DateTimeUtils.toJavaDate(100), - // BigDecimal("0.1"), 100, 100.toShort, 100.toByte, true), - // Row("jumps over the lazy dog.", -Double.NaN, DateTimeUtils.toJavaTimestamp(-100), - // BigDecimal("-0.00001"), -100.toLong, -Float.NaN, DateTimeUtils.toJavaDate(-100), - // BigDecimal("-0.1"), -100, -100.toShort, -100.toByte, true), - // Row("All work and no play makes Jack a dull boy", Double.MinValue, - // DateTimeUtils.toJavaTimestamp(Long.MinValue/1000000), BigDecimal("-99999999999.9999999"), - // Long.MinValue, Float.MinValue, DateTimeUtils.toJavaDate(Int.MinValue/100), - // BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true), - // Row("!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721", Double.MaxValue, - // DateTimeUtils.toJavaTimestamp(Long.MaxValue/1000000), BigDecimal("99999999999.9999999"), - // Long.MaxValue, Float.MaxValue, DateTimeUtils.toJavaDate(Int.MaxValue/100), - // BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false)) + // Row(Row(0, "a", Row(0f, 0L)), "", 0.toDouble, DateTimeUtils.toJavaTimestamp(0), BigDecimal(0), + // 0.toLong, 0.toFloat, DateTimeUtils.toJavaDate(0), BigDecimal(0), 0, 0.toShort, 0.toByte, + // false), + // Row(Row(100, "bc", Row(100f, 100L)), "The quick brown fox", -(0.toDouble), + // DateTimeUtils.toJavaTimestamp(100), BigDecimal("0.00001"), 100.toLong, -(0.toFloat), + // 
DateTimeUtils.toJavaDate(100), BigDecimal("0.1"), 100, 100.toShort, 100.toByte, true), + // Row(Row(-100, "def", Row(-100f, -100L)), "jumps over the lazy dog.", -Double.NaN, + // DateTimeUtils.toJavaTimestamp(-100), BigDecimal("-0.00001"), -100.toLong, -Float.NaN, + // DateTimeUtils.toJavaDate(-100), BigDecimal("-0.1"), -100, -100.toShort, -100.toByte, + // true), + // Row(Row(0x12345678, "ghij", Row(Float.PositiveInfinity, 0x123456789abcdefL)), + // "All work and no play makes Jack a dull boy", Double.MinValue, + // DateTimeUtils.toJavaTimestamp(Long.MinValue/1000000), BigDecimal("-99999999999.9999999"), + // Long.MinValue, Float.MinValue, DateTimeUtils.toJavaDate(Int.MinValue/100), + // BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true), + // Row(Row(-0x76543210, "klmno", Row(Float.NegativeInfinity, -0x123456789abcdefL)), + // "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721", Double.MaxValue, + // DateTimeUtils.toJavaTimestamp(Long.MaxValue/1000000), BigDecimal("99999999999.9999999"), + // Long.MaxValue, Float.MaxValue, DateTimeUtils.toJavaDate(Int.MaxValue/100), + // BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false)) // val df = spark.createDataFrame(sc.parallelize(data), schema) // df.columns.foreach(c => println(s"$c => ${df.select(hash(col(c))).collect.mkString(",")}")) // df.select(hash(col("*"))).collect + fixed_width_column_wrapper const hash_structs_expected( + {-105406170, 90479889, -678041645, 1667387937, 301478567}); fixed_width_column_wrapper const hash_strings_expected( {1467149710, 723257560, -1620282500, -2001858707, 1588473657}); fixed_width_column_wrapper const hash_doubles_expected( @@ -330,18 +352,26 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) fixed_width_column_wrapper const hash_bools_expected( {933211791, -559580957, -559580957, -559580957, 933211791}); fixed_width_column_wrapper const hash_combined_expected( - {-1947042614, -1731440908, 807283935, 725489209, 822276819}); + {-1172364561, -442972638, 1213234395, 796626751, 214075225}); + + using double_limits = std::numeric_limits; + using long_limits = std::numeric_limits; + using float_limits = std::numeric_limits; + using int_limits = std::numeric_limits; + fixed_width_column_wrapper a_col{0, 100, -100, 0x12345678, -0x76543210}; + strings_column_wrapper b_col{"a", "bc", "def", "ghij", "klmno"}; + fixed_width_column_wrapper x_col{ + 0.f, 100.f, -100.f, float_limits::infinity(), -float_limits::infinity()}; + fixed_width_column_wrapper y_col{ + 0L, 100L, -100L, 0x123456789abcdefL, -0x123456789abcdefL}; + structs_column_wrapper c_col{{x_col, y_col}}; + structs_column_wrapper const structs_col{{a_col, b_col, c_col}}; strings_column_wrapper const strings_col({"", "The quick brown fox", "jumps over the lazy dog.", "All work and no play makes Jack a dull boy", "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721"}); - - using double_limits = std::numeric_limits; - using long_limits = std::numeric_limits; - using float_limits = std::numeric_limits; - using int_limits = std::numeric_limits; fixed_width_column_wrapper const doubles_col( {0., -0., -double_limits::quiet_NaN(), double_limits::lowest(), double_limits::max()}); fixed_width_column_wrapper const timestamps_col( @@ -364,6 +394,7 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) fixed_width_column_wrapper const bools_col2({0, 1, 2, 255, 0}); constexpr auto hasher = cudf::hash_id::HASH_SPARK_MURMUR3; + auto const hash_structs = cudf::hash(cudf::table_view({structs_col}), hasher, {}, 42); auto const 
hash_strings = cudf::hash(cudf::table_view({strings_col}), hasher, {}, 314); auto const hash_doubles = cudf::hash(cudf::table_view({doubles_col}), hasher, {}, 42); auto const hash_timestamps = cudf::hash(cudf::table_view({timestamps_col}), hasher, {}, 42); @@ -378,6 +409,7 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) auto const hash_bools1 = cudf::hash(cudf::table_view({bools_col1}), hasher, {}, 42); auto const hash_bools2 = cudf::hash(cudf::table_view({bools_col2}), hasher, {}, 42); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_structs, hash_structs_expected, true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected, true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_doubles, hash_doubles_expected, true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_timestamps, hash_timestamps_expected, true); @@ -392,7 +424,8 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools1, hash_bools_expected, true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools2, hash_bools_expected, true); - auto const combined_table = cudf::table_view({strings_col, + auto const combined_table = cudf::table_view({structs_col, + strings_col, doubles_col, timestamps_col, decimal64_col, @@ -408,6 +441,14 @@ TEST_F(SparkMurmurHash3Test, MultiValueWithSeeds) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_combined, hash_combined_expected, true); } +TEST_F(SparkMurmurHash3Test, ListThrows) +{ + lists_column_wrapper strings_list_col({{""}, {"abc"}, {"123"}}); + EXPECT_THROW( + cudf::hash(cudf::table_view({strings_list_col}), cudf::hash_id::HASH_SPARK_MURMUR3, {}), + cudf::logic_error); +} + class MD5HashTest : public cudf::test::BaseFixture { }; diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java index e6675591164..fcdb5d44ad3 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java @@ -570,8 +570,7 @@ public static ColumnVector serial32BitMurmurHash3(int seed, ColumnView columns[] assert columns[i] != null : "Column vectors passed may not be null"; assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size"; assert !columns[i].getType().isDurationType() : "Unsupported column type Duration"; - assert !columns[i].getType().isTimestampType() : "Unsupported column type Timestamp"; - assert !columns[i].getType().isNestedType() : "Unsupported column of nested type"; + assert !columns[i].getType().equals(DType.LIST) : "List columns are not supported"; columnViews[i] = columns[i].getNativeView(); } return new ColumnVector(hash(columnViews, HashType.HASH_SERIAL_MURMUR3.getNativeId(), new int[0], seed)); @@ -606,7 +605,7 @@ public static ColumnVector spark32BitMurmurHash3(int seed, ColumnView columns[]) assert columns[i] != null : "Column vectors passed may not be null"; assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size"; assert !columns[i].getType().isDurationType() : "Unsupported column type Duration"; - assert !columns[i].getType().isNestedType() : "Unsupported column of nested type"; + assert !columns[i].getType().equals(DType.LIST) : "List columns are not supported"; columnViews[i] = columns[i].getNativeView(); } return new ColumnVector(hash(columnViews, HashType.HASH_SPARK_MURMUR3.getNativeId(), new int[0], seed)); diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index ce2c287a1c8..36123704ae6 100644 --- 
a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -490,6 +490,25 @@ void testSerial32BitMurmur3HashMixed() { } } + @Test + void testSerial32BitMurmur3HashStruct() { + try (ColumnVector strings = ColumnVector.fromStrings( + "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721", + "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " + + "in the MD5 hash function. This string needed to be longer.", + null, null); + ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null); + ColumnVector doubles = ColumnVector.fromBoxedDoubles( + 0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null); + ColumnVector floats = ColumnVector.fromBoxedFloats( + 0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null); + ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null); + ColumnVector result = ColumnVector.serial32BitMurmurHash3(1868, new ColumnVector[]{strings, integers, doubles, floats, bools}); + ColumnVector expected = ColumnVector.fromBoxedInts(387200465, 1988790727, 774895031, 814731646, -1073686048, 1868)) { + assertColumnsAreEqual(expected, result); + } + } + @Test void testSpark32BitMurmur3HashStrings() { try (ColumnVector v0 = ColumnVector.fromStrings( @@ -529,6 +548,8 @@ void testSpark32BitMurmur3HashDoubles() { @Test void testSpark32BitMurmur3HashTimestamps() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 try (ColumnVector v = ColumnVector.timestampMicroSecondsFromBoxedLongs( 0L, null, 100L, -100L, 0x123456789abcdefL, null, -0x123456789abcdefL); ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); @@ -539,6 +560,8 @@ void testSpark32BitMurmur3HashTimestamps() { @Test void testSpark32BitMurmur3HashDecimal64() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 try (ColumnVector v = ColumnVector.decimalFromLongs(-7, 0L, 100L, -100L, 0x123456789abcdefL, -0x123456789abcdefL); ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); @@ -549,6 +572,8 @@ void testSpark32BitMurmur3HashDecimal64() { @Test void testSpark32BitMurmur3HashDecimal32() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 try (ColumnVector v = ColumnVector.decimalFromInts(-3, 0, 100, -100, 0x12345678, -0x12345678); ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); @@ -559,6 +584,8 @@ void testSpark32BitMurmur3HashDecimal32() { @Test void testSpark32BitMurmur3HashDates() { + // The hash values were derived from Apache Spark in a manner similar to the one documented at + // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307 try (ColumnVector v = ColumnVector.timestampDaysFromBoxedInts( 0, null, 100, -100, 0x12345678, null, -0x12345678); ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v}); @@ -587,7 +614,6 @@ void testSpark32BitMurmur3HashBools() { 
ColumnVector result = ColumnVector.spark32BitMurmurHash3(0, new ColumnVector[]{v0, v1}); ColumnVector expected = ColumnVector.fromBoxedInts(0, -1589400010, -239939054, -68075478, 593689054, -1194558265)) { assertColumnsAreEqual(expected, result); - } } @@ -610,6 +636,26 @@ void testSpark32BitMurmur3HashMixed() { } } + @Test + void testSpark32BitMurmur3HashStruct() { + try (ColumnVector strings = ColumnVector.fromStrings( + "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721", + "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " + + "in the MD5 hash function. This string needed to be longer.", + null, null); + ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null); + ColumnVector doubles = ColumnVector.fromBoxedDoubles( + 0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null); + ColumnVector floats = ColumnVector.fromBoxedFloats( + 0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null); + ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null); + ColumnView structs = ColumnView.makeStructView(strings, integers, doubles, floats, bools); + ColumnVector result = ColumnVector.spark32BitMurmurHash3(1868, new ColumnView[]{structs}); + ColumnVector expected = ColumnVector.spark32BitMurmurHash3(1868, new ColumnVector[]{strings, integers, doubles, floats, bools})) { + assertColumnsAreEqual(expected, result); + } + } + @Test void testAndNullReconfigureNulls() { try (ColumnVector v0 = ColumnVector.fromBoxedInts(0, 100, null, null, Integer.MIN_VALUE, null); From 684bb146b918bdbaaf3d7ce47d00b51245fe12e7 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 31 Mar 2021 16:23:37 -0500 Subject: [PATCH 11/59] Fix inplace update of data and add Series.update (#7201) Fixes: #7187 This PR: - [x] Fixes inplace manipulation of columns. - [x] Introduces `Series.update` - [x] Fixes incorrect dtype handling in `Frame.where` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) - Keith Kraus (https://github.com/kkraus14) URL: https://github.com/rapidsai/cudf/pull/7201 --- python/cudf/cudf/_lib/copying.pyx | 6 +- python/cudf/cudf/core/__init__.py | 4 +- python/cudf/cudf/core/_internals/__init__.py | 3 + python/cudf/cudf/core/_internals/where.py | 383 +++++++++++++++++++ python/cudf/cudf/core/dataframe.py | 8 +- python/cudf/cudf/core/frame.py | 232 +---------- python/cudf/cudf/core/series.py | 104 +++++ python/cudf/cudf/tests/test_dataframe.py | 13 +- python/cudf/cudf/tests/test_replace.py | 27 +- python/cudf/cudf/tests/test_series.py | 49 +++ python/cudf/cudf/utils/dtypes.py | 66 +++- 11 files changed, 646 insertions(+), 249 deletions(-) create mode 100644 python/cudf/cudf/core/_internals/__init__.py create mode 100644 python/cudf/cudf/core/_internals/where.py diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 4c72ba2e055..8f93866612e 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2021, NVIDIA CORPORATION. 
import pandas as pd @@ -564,11 +564,11 @@ def copy_if_else(object lhs, object rhs, Column boolean_mask): return _copy_if_else_column_column(lhs, rhs, boolean_mask) else: return _copy_if_else_column_scalar( - lhs, as_device_scalar(rhs, lhs.dtype), boolean_mask) + lhs, as_device_scalar(rhs), boolean_mask) else: if isinstance(rhs, Column): return _copy_if_else_scalar_column( - as_device_scalar(lhs, rhs.dtype), rhs, boolean_mask) + as_device_scalar(lhs), rhs, boolean_mask) else: if lhs is None and rhs is None: return lhs diff --git a/python/cudf/cudf/core/__init__.py b/python/cudf/cudf/core/__init__.py index 91a369c31f8..59173cc0247 100644 --- a/python/cudf/cudf/core/__init__.py +++ b/python/cudf/cudf/core/__init__.py @@ -1,6 +1,6 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. -from cudf.core import buffer, column, column_accessor, common +from cudf.core import _internals, buffer, column, column_accessor, common from cudf.core.buffer import Buffer from cudf.core.dataframe import DataFrame, from_pandas, merge from cudf.core.index import ( diff --git a/python/cudf/cudf/core/_internals/__init__.py b/python/cudf/cudf/core/_internals/__init__.py new file mode 100644 index 00000000000..53d186def85 --- /dev/null +++ b/python/cudf/cudf/core/_internals/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from cudf.core._internals.where import where diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py new file mode 100644 index 00000000000..1fdc907875e --- /dev/null +++ b/python/cudf/cudf/core/_internals/where.py @@ -0,0 +1,383 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +import warnings +from typing import Any, Optional, Tuple, Union, cast + +import numpy as np +import pandas as pd + +import cudf +from cudf._typing import ColumnLike, ScalarLike +from cudf.core.column import ColumnBase +from cudf.core.dataframe import DataFrame +from cudf.core.frame import Frame +from cudf.core.index import Index +from cudf.core.series import Series + + +def _normalize_scalars(col: ColumnBase, other: ScalarLike) -> ScalarLike: + """ + Try to normalize scalar values as per col dtype + """ + if (isinstance(other, float) and not np.isnan(other)) and ( + col.dtype.type(other) != other + ): + raise TypeError( + f"Cannot safely cast non-equivalent " + f"{type(other).__name__} to {col.dtype.name}" + ) + + return cudf.Scalar(other, dtype=col.dtype if other is None else None) + + +def _check_and_cast_columns_with_other( + source_col: ColumnBase, + other: Union[ScalarLike, ColumnBase], + inplace: bool, +) -> Tuple[ColumnBase, Union[ScalarLike, ColumnBase]]: + """ + Returns type-casted column `source_col` & scalar `other_scalar` + based on `inplace` parameter. + """ + if cudf.utils.dtypes.is_categorical_dtype(source_col.dtype): + return source_col, other + + if cudf.utils.dtypes.is_scalar(other): + device_obj = _normalize_scalars(source_col, other) + else: + device_obj = other + + if other is None: + return source_col, device_obj + elif cudf.utils.dtypes.is_mixed_with_object_dtype(device_obj, source_col): + raise TypeError( + "cudf does not support mixed types, please type-cast " + "the column of dataframe/series and other " + "to same dtypes." 
+ ) + if inplace: + if not cudf.utils.dtypes._can_cast(device_obj.dtype, source_col.dtype): + warnings.warn( + f"Type-casting from {device_obj.dtype} " + f"to {source_col.dtype}, there could be potential data loss" + ) + return source_col, device_obj.astype(source_col.dtype) + else: + if ( + cudf.utils.dtypes.is_scalar(other) + and cudf.utils.dtypes.is_numerical_dtype(source_col.dtype) + and cudf.utils.dtypes._can_cast(other, source_col.dtype) + ): + common_dtype = source_col.dtype + return ( + source_col.astype(common_dtype), + cudf.Scalar(other, dtype=common_dtype), + ) + else: + common_dtype = cudf.utils.dtypes.find_common_type( + [ + source_col.dtype, + np.min_scalar_type(other) + if cudf.utils.dtypes.is_scalar(other) + else other.dtype, + ] + ) + if cudf.utils.dtypes.is_scalar(device_obj): + device_obj = cudf.Scalar(other, dtype=common_dtype) + else: + device_obj = device_obj.astype(common_dtype) + return source_col.astype(common_dtype), device_obj + + +def _normalize_columns_and_scalars_type( + frame: Union[Series, Index, DataFrame], other: Any, inplace: bool = False, +) -> Tuple[ + Union[Series, Index, DataFrame, ColumnLike], Any, +]: + """ + Try to normalize the other's dtypes as per frame. + + Parameters + ---------- + + frame : Can be a DataFrame or Series or Index + other : Can be a DataFrame, Series, Index, Array + like object or a scalar value + + if frame is DataFrame, other can be only a + scalar or array like with size of number of columns + in DataFrame or a DataFrame with same dimension + + if frame is Series, other can be only a scalar or + a series like with same length as frame + + Returns: + -------- + A dataframe/series/list/scalar form of normalized other + """ + if isinstance(frame, DataFrame) and isinstance(other, DataFrame): + source_df = frame.copy(deep=False) + other_df = other.copy(deep=False) + for self_col in source_df._column_names: + source_col, other_col = _check_and_cast_columns_with_other( + source_col=source_df._data[self_col], + other=other_df._data[self_col], + inplace=inplace, + ) + source_df._data[self_col] = source_col + other_df._data[self_col] = other_col + return source_df, other_df + + elif isinstance( + frame, (Series, Index) + ) and not cudf.utils.dtypes.is_scalar(other): + other = cudf.core.column.as_column(other) + input_col = frame._data[frame.name] + return _check_and_cast_columns_with_other( + source_col=input_col, other=other, inplace=inplace + ) + else: + # Handles scalar or list/array like scalars + if isinstance(frame, (Series, Index)) and cudf.utils.dtypes.is_scalar( + other + ): + input_col = frame._data[frame.name] + return _check_and_cast_columns_with_other( + source_col=frame._data[frame.name], + other=other, + inplace=inplace, + ) + + elif isinstance(frame, DataFrame): + if cudf.utils.dtypes.is_scalar(other): + other = [other for i in range(len(frame._column_names))] + + source_df = frame.copy(deep=False) + others = [] + for col_name, other_sclr in zip(frame._column_names, other): + + ( + source_col, + other_scalar, + ) = _check_and_cast_columns_with_other( + source_col=source_df._data[col_name], + other=other_sclr, + inplace=inplace, + ) + source_df._data[col_name] = source_col + others.append(other_scalar) + return source_df, others + else: + raise ValueError( + f"Inappropriate input {type(frame)} " + f"and other {type(other)} combination" + ) + + +def where( + frame: Union[Series, Index, DataFrame], + cond: Any, + other: Any = None, + inplace: bool = False, +) -> Optional[Union[Frame]]: + """ + Replace values where the 
condition is False.
+
+    Parameters
+    ----------
+    cond : bool Series/DataFrame, array-like
+        Where cond is True, keep the original value.
+        Where False, replace with corresponding value from other.
+        Callables are not supported.
+    other : scalar, list of scalars, Series/DataFrame
+        Entries where cond is False are replaced with
+        corresponding value from other. Callables are not
+        supported. Default is None.
+
+        DataFrame expects only Scalar or array like with scalars or
+        dataframe with same dimension as frame.
+
+        Series expects only scalar or series like with same length.
+    inplace : bool, default False
+        Whether to perform the operation in place on the data.
+
+    Returns
+    -------
+    Same type as caller
+
+    Examples
+    --------
+    >>> import cudf
+    >>> df = cudf.DataFrame({"A":[1, 4, 5], "B":[3, 5, 8]})
+    >>> df.where(df % 2 == 0, [-1, -1])
+       A  B
+    0 -1 -1
+    1  4 -1
+    2 -1  8
+
+    >>> ser = cudf.Series([4, 3, 2, 1, 0])
+    >>> ser.where(ser > 2, 10)
+    0     4
+    1     3
+    2    10
+    3    10
+    4    10
+    dtype: int64
+    >>> ser.where(ser > 2)
+    0       4
+    1       3
+    2    <NA>
+    3    <NA>
+    4    <NA>
+    dtype: int64
+    """
+
+    if isinstance(frame, DataFrame):
+        if hasattr(cond, "__cuda_array_interface__"):
+            cond = DataFrame(
+                cond, columns=frame._column_names, index=frame.index
+            )
+        elif (
+            hasattr(cond, "__array_interface__")
+            and cond.__array_interface__["shape"] != frame.shape
+        ):
+            raise ValueError("conditional must be same shape as self")
+        elif not isinstance(cond, DataFrame):
+            cond = frame.from_pandas(pd.DataFrame(cond))
+
+        common_cols = set(frame._column_names).intersection(
+            set(cond._column_names)
+        )
+        if len(common_cols) > 0:
+            # If `frame` and `cond` are having unequal index,
+            # then re-index `cond`.
+            if not frame.index.equals(cond.index):
+                cond = cond.reindex(frame.index)
+        else:
+            if cond.shape != frame.shape:
+                raise ValueError(
+                    """Array conditional must be same shape as self"""
+                )
+            # Setting `frame` column names to `cond`
+            # as `cond` has no column names.
+            cond.columns = frame.columns
+
+        (source_df, others,) = _normalize_columns_and_scalars_type(
+            frame, other
+        )
+        if isinstance(other, Frame):
+            others = others._data.columns
+
+        out_df = DataFrame(index=frame.index)
+        if len(frame._columns) != len(others):
+            raise ValueError(
+                """Replacement list length or number of dataframe columns
+                    should be equal to Number of columns of dataframe"""
+            )
+        for i, column_name in enumerate(frame._column_names):
+            input_col = source_df._data[column_name]
+            other_column = others[i]
+            if column_name in cond._data:
+                if isinstance(input_col, cudf.core.column.CategoricalColumn):
+                    if cudf.utils.dtypes.is_scalar(other_column):
+                        try:
+                            other_column = input_col._encode(other_column)
+                        except ValueError:
+                            # When other is not present in categories,
+                            # fill with Null.
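+                            # (the None below is wrapped in a cudf.Scalar
+                            # typed as the codes dtype, so it becomes a null
+                            # code and the categories are left untouched)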
+ other_column = None + other_column = cudf.Scalar( + other_column, dtype=input_col.codes.dtype + ) + elif isinstance( + other_column, cudf.core.column.CategoricalColumn + ): + other_column = other_column.codes + input_col = input_col.codes + + result = cudf._lib.copying.copy_if_else( + input_col, other_column, cond._data[column_name] + ) + + if isinstance( + frame._data[column_name], + cudf.core.column.CategoricalColumn, + ): + result = cudf.core.column.build_categorical_column( + categories=frame._data[column_name].categories, + codes=cudf.core.column.as_column( + result.base_data, dtype=result.dtype + ), + mask=result.base_mask, + size=result.size, + offset=result.offset, + ordered=frame._data[column_name].ordered, + ) + else: + out_mask = cudf._lib.null_mask.create_null_mask( + len(input_col), + state=cudf._lib.null_mask.MaskState.ALL_NULL, + ) + result = input_col.set_mask(out_mask) + out_df[column_name] = frame[column_name].__class__(result) + + return frame._mimic_inplace(out_df, inplace=inplace) + + else: + if isinstance(other, DataFrame): + raise NotImplementedError( + "cannot align with a higher dimensional Frame" + ) + input_col = frame._data[frame.name] + cond = cudf.core.column.as_column(cond) + if len(cond) != len(frame): + raise ValueError( + """Array conditional must be same shape as self""" + ) + + (input_col, other,) = _normalize_columns_and_scalars_type( + frame, other, inplace + ) + + if isinstance(input_col, cudf.core.column.CategoricalColumn): + if cudf.utils.dtypes.is_scalar(other): + try: + other = input_col._encode(other) + except ValueError: + # When other is not present in categories, + # fill with Null. + other = None + other = cudf.Scalar(other, dtype=input_col.codes.dtype) + elif isinstance(other, cudf.core.column.CategoricalColumn): + other = other.codes + + input_col = input_col.codes + + result = cudf._lib.copying.copy_if_else(input_col, other, cond) + + if isinstance( + frame._data[frame.name], cudf.core.column.CategoricalColumn + ): + result = cudf.core.column.build_categorical_column( + categories=cast( + cudf.core.column.CategoricalColumn, + frame._data[frame.name], + ).categories, + codes=cudf.core.column.as_column( + result.base_data, dtype=result.dtype + ), + mask=result.base_mask, + size=result.size, + offset=result.offset, + ordered=cast( + cudf.core.column.CategoricalColumn, + frame._data[frame.name], + ).ordered, + ) + + if isinstance(frame, Index): + result = Index(result, name=frame.name) + else: + result = frame._copy_construct(data=result) + + return frame._mimic_inplace(result, inplace=inplace) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 01b96151485..6639fc7c25c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1658,8 +1658,9 @@ def update( if not self.index.equals(other.index): other = other.reindex(self.index, axis=0) - for col in self.columns: - this = self[col] + source_df = self.copy(deep=False) + for col in source_df._column_names: + this = source_df[col] that = other[col] if errors == "raise": @@ -1676,8 +1677,9 @@ def update( # don't overwrite columns unnecessarily if mask.all(): continue + source_df[col] = source_df[col].where(mask, that) - self[col] = this.where(mask, that) + self._mimic_inplace(source_df, inplace=True) def __add__(self, other): return self._apply_op("__add__", other) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index fb746d6c794..bc43c367833 100644 --- a/python/cudf/cudf/core/frame.py +++ 
b/python/cudf/cudf/core/frame.py @@ -6,7 +6,7 @@ import functools import warnings from collections import OrderedDict, abc as abc -from typing import TYPE_CHECKING, Any, Dict, Tuple, TypeVar, Union, overload +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, TypeVar, Union import cupy import numpy as np @@ -14,7 +14,6 @@ import pyarrow as pa from nvtx import annotate from pandas.api.types import is_dict_like, is_dtype_equal -from typing_extensions import Literal import cudf from cudf import _lib as libcudf @@ -53,19 +52,9 @@ class Frame(libcudf.table.Table): def _from_table(cls, table: Frame): return cls(table._data, index=table._index) - @overload - def _mimic_inplace(self, result: Frame) -> Frame: - ... - - @overload - def _mimic_inplace(self, result: Frame, inplace: Literal[True]): - ... - - @overload - def _mimic_inplace(self, result: Frame, inplace: Literal[False]) -> Frame: - ... - - def _mimic_inplace(self, result, inplace=False): + def _mimic_inplace( + self: T, result: Frame, inplace: bool = False + ) -> Optional[Frame]: if inplace: for col in self._data: if col in result._data: @@ -74,6 +63,7 @@ def _mimic_inplace(self, result, inplace=False): ) self._data = result._data self._index = result._index + return None else: return result @@ -796,87 +786,6 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): return self._mimic_inplace(output, inplace=inplace) - def _normalize_scalars(self, other): - """ - Try to normalizes scalar values as per self dtype - """ - if ( - other is not None - and (isinstance(other, float) and not np.isnan(other)) - ) and (self.dtype.type(other) != other): - raise TypeError( - f"Cannot safely cast non-equivalent " - f"{type(other).__name__} to {self.dtype.name}" - ) - - return ( - self.dtype.type(other) - if ( - other is not None - and (isinstance(other, float) and not np.isnan(other)) - ) - else other - ) - - def _normalize_columns_and_scalars_type(self, other): - """ - Try to normalize the other's dtypes as per self. - - Parameters - ---------- - - self : Can be a DataFrame or Series or Index - other : Can be a DataFrame, Series, Index, Array - like object or a scalar value - - if self is DataFrame, other can be only a - scalar or array like with size of number of columns - in DataFrame or a DataFrame with same dimension - - if self is Series, other can be only a scalar or - a series like with same length as self - - Returns: - -------- - A dataframe/series/list/scalar form of normalized other - """ - if isinstance(self, cudf.DataFrame) and isinstance( - other, cudf.DataFrame - ): - return [ - other[self_col].astype(self._data[self_col].dtype)._column - for self_col in self._data.names - ] - - elif isinstance(self, (cudf.Series, cudf.Index)) and not is_scalar( - other - ): - other = as_column(other) - return other.astype(self.dtype) - - else: - # Handles scalar or list/array like scalars - if isinstance(self, (cudf.Series, cudf.Index)) and is_scalar( - other - ): - return self._normalize_scalars(other) - - elif isinstance(self, cudf.DataFrame): - out = [] - if is_scalar(other): - other = [other for i in range(len(self._data.names))] - out = [ - self[in_col_name]._normalize_scalars(sclr) - for in_col_name, sclr in zip(self._data.names, other) - ] - - return out - else: - raise ValueError( - f"Inappropriate input {type(self)} " - f"and other {type(other)} combination" - ) - def where(self, cond, other=None, inplace=False): """ Replace values where the condition is False. 
@@ -930,133 +839,9 @@ def where(self, cond, other=None, inplace=False): dtype: int64 """ - if isinstance(self, cudf.DataFrame): - if hasattr(cond, "__cuda_array_interface__"): - cond = cudf.DataFrame( - cond, columns=self._data.names, index=self.index - ) - elif not isinstance(cond, cudf.DataFrame): - cond = self.from_pandas(pd.DataFrame(cond)) - - common_cols = set(self._data.names).intersection( - set(cond._data.names) - ) - if len(common_cols) > 0: - # If `self` and `cond` are having unequal index, - # then re-index `cond`. - if not self.index.equals(cond.index): - cond = cond.reindex(self.index) - else: - if cond.shape != self.shape: - raise ValueError( - """Array conditional must be same shape as self""" - ) - # Setting `self` column names to `cond` - # as `cond` has no column names. - cond.columns = self.columns - - other = self._normalize_columns_and_scalars_type(other) - out_df = cudf.DataFrame(index=self.index) - if len(self._columns) != len(other): - raise ValueError( - """Replacement list length or number of dataframe columns - should be equal to Number of columns of dataframe""" - ) - - for column_name, other_column in zip(self._data.names, other): - input_col = self._data[column_name] - if column_name in cond._data: - if isinstance( - input_col, cudf.core.column.CategoricalColumn - ): - if np.isscalar(other_column): - try: - other_column = input_col._encode(other_column) - except ValueError: - # When other is not present in categories, - # fill with Null. - other_column = None - elif hasattr(other_column, "codes"): - other_column = other_column.codes - input_col = input_col.codes - - result = libcudf.copying.copy_if_else( - input_col, other_column, cond._data[column_name] - ) - - if isinstance( - self._data[column_name], - cudf.core.column.CategoricalColumn, - ): - result = build_categorical_column( - categories=self._data[column_name].categories, - codes=as_column( - result.base_data, dtype=result.dtype - ), - mask=result.base_mask, - size=result.size, - offset=result.offset, - ordered=self._data[column_name].ordered, - ) - else: - from cudf._lib.null_mask import MaskState, create_null_mask - - out_mask = create_null_mask( - len(input_col), state=MaskState.ALL_NULL - ) - result = input_col.set_mask(out_mask) - out_df[column_name] = self[column_name].__class__(result) - - return self._mimic_inplace(out_df, inplace=inplace) - - else: - - if isinstance(other, cudf.DataFrame): - raise NotImplementedError( - "cannot align with a higher dimensional Frame" - ) - - other = self._normalize_columns_and_scalars_type(other) - - cond = as_column(cond) - if len(cond) != len(self): - raise ValueError( - """Array conditional must be same shape as self""" - ) - input_col = self._data[self.name] - if isinstance(input_col, cudf.core.column.CategoricalColumn): - if np.isscalar(other): - try: - other = input_col._encode(other) - except ValueError: - # When other is not present in categories, - # fill with Null. 
- other = None - elif hasattr(other, "codes"): - other = other.codes - - input_col = input_col.codes - - result = libcudf.copying.copy_if_else(input_col, other, cond) - - if is_categorical_dtype(self.dtype): - result = build_categorical_column( - categories=self._data[self.name].categories, - codes=as_column(result.base_data, dtype=result.dtype), - mask=result.base_mask, - size=result.size, - offset=result.offset, - ordered=self._data[self.name].ordered, - ) - - if isinstance(self, cudf.Index): - from cudf.core.index import as_index - - result = as_index(result, name=self.name) - else: - result = self._copy_construct(data=result) - - return self._mimic_inplace(result, inplace=inplace) + return cudf.core._internals.where( + frame=self, cond=cond, other=other, inplace=inplace + ) def mask(self, cond, other=None, inplace=False): """ @@ -2735,7 +2520,6 @@ def searchsorted( array([4, 4, 4, 0], dtype=int32) """ # Call libcudf++ search_sorted primitive - from cudf.utils.dtypes import is_scalar scalar_flag = None if is_scalar(values): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 71a4a48a07a..955519d0b57 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3923,6 +3923,110 @@ def replace( return self._mimic_inplace(result, inplace=inplace) + def update(self, other): + """ + Modify Series in place using values from passed Series. + Uses non-NA values from passed Series to make updates. Aligns + on index. + + Parameters + ---------- + other : Series, or object coercible into Series + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update(cudf.Series([4, 5, 6])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: int64 + >>> s = cudf.Series(['a', 'b', 'c']) + >>> s + 0 a + 1 b + 2 c + dtype: object + >>> s.update(cudf.Series(['d', 'e'], index=[0, 2])) + >>> s + 0 d + 1 b + 2 e + dtype: object + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update(cudf.Series([4, 5, 6, 7, 8])) + >>> s + 0 4 + 1 5 + 2 6 + dtype: int64 + + If ``other`` contains NaNs the corresponding values are not updated + in the original Series. 
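+
+        Under the hood, ``update`` masks ``self`` with ``other.notna()``,
+        so positions where ``other`` is null or NaN always keep their
+        original values.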
+ + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update(cudf.Series([4, np.nan, 6], nan_as_null=False)) + >>> s + 0 4 + 1 2 + 2 6 + dtype: int64 + + ``other`` can also be a non-Series object type + that is coercible into a Series + + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update([4, np.nan, 6]) + >>> s + 0 4 + 1 2 + 2 6 + dtype: int64 + >>> s = cudf.Series([1, 2, 3]) + >>> s + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> s.update({1: 9}) + >>> s + 0 1 + 1 9 + 2 3 + dtype: int64 + """ + + if not isinstance(other, cudf.Series): + other = cudf.Series(other) + + if not self.index.equals(other.index): + other = other.reindex(index=self.index) + mask = other.notna() + + self.mask(mask, other, inplace=True) + def reverse(self): """ Reverse the Series diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d72b88f1713..f068d02d575 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8215,9 +8215,6 @@ def test_agg_for_dataframe_with_string_columns(aggs): @pytest.mark.parametrize( "overwrite", [True, False], ) -@pytest.mark.parametrize( - "filter_func", [None], -) @pytest.mark.parametrize( "errors", ["ignore"], ) @@ -8262,19 +8259,17 @@ def test_agg_for_dataframe_with_string_columns(aggs): }, ], ) -def test_update_for_dataframes( - data, data2, join, overwrite, filter_func, errors -): +def test_update_for_dataframes(data, data2, join, overwrite, errors): pdf = pd.DataFrame(data) gdf = cudf.DataFrame(data) other_pd = pd.DataFrame(data2) other_gd = cudf.DataFrame(data2) - expect = pdf.update(other_pd, join, overwrite, filter_func, errors) - got = gdf.update(other_gd, join, overwrite, filter_func, errors) + pdf.update(other=other_pd, join=join, overwrite=overwrite, errors=errors) + gdf.update(other=other_gd, join=join, overwrite=overwrite, errors=errors) - assert_eq(expect, got) + assert_eq(pdf, gdf, check_dtype=False) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py index e7baa4ee926..65ce2a79992 100644 --- a/python/cudf/cudf/tests/test_replace.py +++ b/python/cudf/cudf/tests/test_replace.py @@ -709,25 +709,40 @@ def test_series_where(data_dtype, fill_value): sr.where(sr > 0, fill_value) else: # Cast back to original dtype as pandas automatically upcasts - expect = psr.where(psr > 0, fill_value).astype(psr.dtype) + expect = psr.where(psr > 0, fill_value) got = sr.where(sr > 0, fill_value) - assert_eq(expect, got) + # pandas returns 'float16' dtype, which is not supported in cudf + assert_eq( + expect, + got, + check_dtype=False if expect.dtype.kind in ("f") else True, + ) if sr.dtype.type(fill_value) != fill_value: with pytest.raises(TypeError): sr.where(sr < 0, fill_value) else: - expect = psr.where(psr < 0, fill_value).astype(psr.dtype) + expect = psr.where(psr < 0, fill_value) got = sr.where(sr < 0, fill_value) - assert_eq(expect, got) + # pandas returns 'float16' dtype, which is not supported in cudf + assert_eq( + expect, + got, + check_dtype=False if expect.dtype.kind in ("f") else True, + ) if sr.dtype.type(fill_value) != fill_value: with pytest.raises(TypeError): sr.where(sr == 0, fill_value) else: - expect = psr.where(psr == 0, fill_value).astype(psr.dtype) + expect = psr.where(psr == 0, fill_value) got = sr.where(sr == 0, fill_value) - assert_eq(expect, got) + # pandas returns 'float16' dtype, which is not supported in cudf + assert_eq( + expect, + got, + 
check_dtype=False if expect.dtype.kind in ("f") else True, + ) @pytest.mark.parametrize("fill_value", [100, 100.0, 100.5]) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index beda14934ca..0dc53fa29e9 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -921,6 +921,42 @@ def custom_add_func(sr, val): ) +@pytest.mark.parametrize( + "data", + [cudf.Series([1, 2, 3]), cudf.Series([10, 11, 12], index=[1, 2, 3])], +) +@pytest.mark.parametrize( + "other", + [ + cudf.Series([4, 5, 6]), + cudf.Series([4, 5, 6, 7, 8]), + cudf.Series([4, np.nan, 6], nan_as_null=False), + [4, np.nan, 6], + {1: 9}, + ], +) +def test_series_update(data, other): + gs = data.copy(deep=True) + if isinstance(other, cudf.Series): + g_other = other.copy(deep=True) + p_other = g_other.to_pandas() + else: + g_other = other + p_other = other + + ps = gs.to_pandas() + + gs_column_before = gs._column + gs.update(g_other) + gs_column_after = gs._column + + assert_eq(gs_column_before.to_array(), gs_column_after.to_array()) + + ps.update(p_other) + + assert_eq(gs, ps) + + @pytest.mark.parametrize( "data", [ @@ -942,6 +978,19 @@ def test_fillna_with_nan(data, nan_as_null, fill_value): assert_eq(expected, actual) +def test_series_mask_mixed_dtypes_error(): + s = cudf.Series(["a", "b", "c"]) + with pytest.raises( + TypeError, + match=re.escape( + "cudf does not support mixed types, please type-cast " + "the column of dataframe/series and other " + "to same dtypes." + ), + ): + s.where([True, False, True], [1, 2, 3]) + + @pytest.mark.parametrize( "ps", [ diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 8af225ecb58..be2b1bca2e0 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -345,7 +345,7 @@ def to_cudf_compatible_scalar(val, dtype=None): if not is_scalar(val): raise ValueError( f"Cannot convert value of type {type(val).__name__} " - " to cudf scalar" + "to cudf scalar" ) if isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0: @@ -637,6 +637,11 @@ def find_common_type(dtypes): # Aggregate same types dtypes = set(dtypes) + if any(is_decimal_dtype(dtype) for dtype in dtypes): + raise NotImplementedError( + "DecimalDtype is not yet supported in find_common_type" + ) + # Corner case 1: # Resort to np.result_type to handle "M" and "m" types separately dt_dtypes = set(filter(lambda t: is_datetime_dtype(t), dtypes)) @@ -651,7 +656,64 @@ def find_common_type(dtypes): dtypes = dtypes - td_dtypes dtypes.add(np.result_type(*td_dtypes)) - return np.find_common_type(list(dtypes), []) + common_dtype = np.find_common_type(list(dtypes), []) + if common_dtype == np.dtype("float16"): + # cuDF does not support float16 dtype + return np.dtype("float32") + else: + return common_dtype + + +def _can_cast(from_dtype, to_dtype): + """ + Utility function to determine if we can cast + from `from_dtype` to `to_dtype`. This function primarily calls + `np.can_cast` but with some special handling around + cudf specific dtypes. 
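+
+    Examples
+    --------
+    Plain NumPy dtypes on both sides fall through to ``np.can_cast``:
+
+    >>> import numpy as np
+    >>> _can_cast(np.dtype("int64"), np.dtype("float64"))
+    True
+    >>> _can_cast(np.dtype("float64"), np.dtype("int64"))
+    False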
+ """ + if isinstance(from_dtype, type): + from_dtype = np.dtype(from_dtype) + if isinstance(to_dtype, type): + to_dtype = np.dtype(to_dtype) + + # TODO : Add precision & scale checking for + # decimal types in future + if isinstance(from_dtype, cudf.core.dtypes.Decimal64Dtype): + if isinstance(to_dtype, cudf.core.dtypes.Decimal64Dtype): + return True + elif isinstance(to_dtype, np.dtype): + if to_dtype.kind in {"i", "f", "u", "U", "O"}: + return True + else: + return False + elif isinstance(from_dtype, np.dtype): + if isinstance(to_dtype, np.dtype): + return np.can_cast(from_dtype, to_dtype) + elif isinstance(to_dtype, cudf.core.dtypes.Decimal64Dtype): + if from_dtype.kind in {"i", "f", "u", "U", "O"}: + return True + else: + return False + elif isinstance(to_dtype, cudf.core.types.CategoricalDtype): + return True + else: + return False + elif isinstance(from_dtype, cudf.core.dtypes.ListDtype): + # TODO: Add level based checks too once casting of + # list columns is supported + if isinstance(to_dtype, cudf.core.dtypes.ListDtype): + return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type) + else: + return False + elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype): + if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype): + return True + elif isinstance(to_dtype, np.dtype): + return np.can_cast(from_dtype._categories.dtype, to_dtype) + else: + return False + else: + return np.can_cast(from_dtype, to_dtype) # Type dispatch loops similar to what are found in `np.add.types` From 157b6a883dfe390c329ce0d497e668dac3e736c9 Mon Sep 17 00:00:00 2001 From: Ray Douglass <3107146+raydouglass@users.noreply.github.com> Date: Wed, 31 Mar 2021 18:04:22 -0400 Subject: [PATCH 12/59] Revert "Update conda recipes pinning of repo dependencies (#7743)" (#7793) This reverts commit 7d49f75df9681dbe1653029e7d508355884a6d86 - #7743. 
Authors: - Ray Douglass (https://github.com/raydouglass) Approvers: - Mike Wendt (https://github.com/mike-wendt) URL: https://github.com/rapidsai/cudf/pull/7793 --- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/cudf_kafka/meta.yaml | 8 ++++---- conda/recipes/custreamz/meta.yaml | 8 ++++---- conda/recipes/dask-cudf/meta.yaml | 6 +++--- conda/recipes/libcudf_kafka/meta.yaml | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index a119040bbcf..5635f54ba20 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -28,7 +28,7 @@ requirements: - numba >=0.49.0 - dlpack - pyarrow 1.0.1 - - libcudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} + - libcudf {{ version }} - rmm {{ minor_version }} - cudatoolkit {{ cuda_version }} run: diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index cc3f30091bf..0acd9ec4bb2 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -29,12 +29,12 @@ requirements: - python - cython >=0.29,<0.30 - setuptools - - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} - - libcudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} + - cudf {{ version }} + - libcudf_kafka {{ version }} run: - - libcudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} + - libcudf_kafka {{ version }} - python-confluent-kafka - - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} + - cudf {{ version }} test: requires: diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 8edca7a51d0..ffda6d0c3c6 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -23,15 +23,15 @@ requirements: host: - python - python-confluent-kafka - - cudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} + - cudf_kafka {{ version }} run: - python - - streamz - - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} + - streamz + - cudf {{ version }} - dask >=2.22.0 - distributed >=2.22.0 - python-confluent-kafka - - cudf_kafka {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} + - cudf_kafka {{ version }} test: requires: diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index a8768e26056..66bffdfd61e 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -22,15 +22,15 @@ build: requirements: host: - python - - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} + - cudf {{ version }} - dask>=2021.3.1 - distributed >=2.22.0 run: - python - - cudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} + - cudf {{ version }} - dask>=2021.3.1 - distributed >=2.22.0 - + test: requires: - cudatoolkit {{ cuda_version }}.* diff --git a/conda/recipes/libcudf_kafka/meta.yaml b/conda/recipes/libcudf_kafka/meta.yaml index 81ff922b8d7..5348ec471e9 100644 --- a/conda/recipes/libcudf_kafka/meta.yaml +++ b/conda/recipes/libcudf_kafka/meta.yaml @@ -25,7 +25,7 @@ requirements: build: - cmake >=3.17.0 host: - - libcudf {{ version }}=*_{{ GIT_DESCRIBE_NUMBER }} + - libcudf {{ version }} - librdkafka >=1.5.0,<1.5.3 run: - {{ pin_compatible('librdkafka', max_pin='x.x') }} #TODO: librdkafka should be automatically included here by run_exports but is not From e379ab1a363fa25a7e2f811908f4186214a76e1f Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 31 Mar 2021 17:23:48 -0500 Subject: [PATCH 13/59] Implement DecimalColumn + Scalar and add cudf.Scalars of Decimal64Dtype (#7732) Closes 
https://github.com/rapidsai/cudf/issues/7680 Authors: - https://github.com/brandon-b-miller - Michael Wang (https://github.com/isVoid) Approvers: - Keith Kraus (https://github.com/kkraus14) URL: https://github.com/rapidsai/cudf/pull/7732 --- python/cudf/cudf/_lib/copying.pyx | 4 +- python/cudf/cudf/_lib/cpp/scalar/scalar.pxd | 9 + .../cudf/cudf/_lib/cpp/wrappers/decimals.pxd | 8 + python/cudf/cudf/_lib/scalar.pxd | 3 +- python/cudf/cudf/_lib/scalar.pyx | 65 +++++- python/cudf/cudf/core/column/decimal.py | 8 + python/cudf/cudf/core/dtypes.py | 9 + python/cudf/cudf/core/scalar.py | 49 +++-- python/cudf/cudf/tests/test_binops.py | 187 ++++++++++++++++++ python/cudf/cudf/tests/test_scalar.py | 77 +++++++- python/cudf/cudf/utils/dtypes.py | 22 +++ 11 files changed, 405 insertions(+), 36 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index 8f93866612e..548e16155dd 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -685,7 +685,9 @@ def get_element(Column input_column, size_type index): cpp_copying.get_element(col_view, index) ) - return DeviceScalar.from_unique_ptr(move(c_output)) + return DeviceScalar.from_unique_ptr( + move(c_output), dtype=input_column.dtype + ) def sample(Table input, size_type n, diff --git a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd index 3eb11c2bfd0..fec1c6382e6 100644 --- a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd +++ b/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd @@ -7,6 +7,7 @@ from libcpp cimport bool from libcpp.string cimport string from cudf._lib.cpp.types cimport data_type +from cudf._lib.cpp.wrappers.decimals cimport scale_type cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: cdef cppclass scalar: @@ -51,3 +52,11 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: string_scalar(string st, bool is_valid) except + string_scalar(string_scalar other) except + string to_string() except + + + cdef cppclass fixed_point_scalar[T](scalar): + fixed_point_scalar() except + + fixed_point_scalar(int64_t value, + scale_type scale, + bool is_valid) except + + int64_t value() except + + # TODO: Figure out how to add an int32 overload of value() diff --git a/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd b/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd new file mode 100644 index 00000000000..a73e6e0151d --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
+from libc.stdint cimport int64_t, int32_t + +cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: + ctypedef int64_t decimal64 + + cdef cppclass scale_type: + scale_type(int32_t) diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index d44bac0e435..2fafe0f2c67 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -8,10 +8,11 @@ from cudf._lib.cpp.scalar.scalar cimport scalar cdef class DeviceScalar: cdef unique_ptr[scalar] c_value + cdef object _dtype cdef const scalar* get_raw_ptr(self) except * @staticmethod - cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr) + cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr, dtype=*) cpdef bool is_valid(DeviceScalar s) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index a5945bc72f0..b31f0675422 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,5 +1,5 @@ # Copyright (c) 2020, NVIDIA CORPORATION. - +import decimal import numpy as np import pandas as pd @@ -34,15 +34,19 @@ from cudf._lib.cpp.wrappers.durations cimport( duration_us, duration_ns ) +from cudf._lib.cpp.wrappers.decimals cimport decimal64, scale_type from cudf._lib.cpp.scalar.scalar cimport ( scalar, numeric_scalar, timestamp_scalar, duration_scalar, - string_scalar + string_scalar, + fixed_point_scalar ) +from cudf.utils.dtypes import _decimal_to_int64 cimport cudf._lib.cpp.types as libcudf_types + cdef class DeviceScalar: def __init__(self, value, dtype): @@ -59,14 +63,17 @@ cdef class DeviceScalar: dtype : dtype A NumPy dtype. """ - - self._set_value(value, dtype) + self._dtype = dtype if dtype.kind != 'U' else np.dtype('object') + self._set_value(value, self._dtype) def _set_value(self, value, dtype): # IMPORTANT: this should only ever be called from __init__ valid = not _is_null_host_scalar(value) - if pd.api.types.is_string_dtype(dtype): + if isinstance(dtype, cudf.Decimal64Dtype): + _set_decimal64_from_scalar( + self.c_value, value, dtype, valid) + elif pd.api.types.is_string_dtype(dtype): _set_string_from_np_string(self.c_value, value, valid) elif pd.api.types.is_numeric_dtype(dtype): _set_numeric_from_np_scalar(self.c_value, @@ -88,7 +95,9 @@ cdef class DeviceScalar: ) def _to_host_scalar(self): - if pd.api.types.is_string_dtype(self.dtype): + if isinstance(self.dtype, cudf.Decimal64Dtype): + result = _get_py_decimal_from_fixed_point(self.c_value) + elif pd.api.types.is_string_dtype(self.dtype): result = _get_py_string_from_string(self.c_value) elif pd.api.types.is_numeric_dtype(self.dtype): result = _get_np_scalar_from_numeric(self.c_value) @@ -108,8 +117,7 @@ cdef class DeviceScalar: The NumPy dtype corresponding to the data type of the underlying device scalar. """ - cdef libcudf_types.data_type cdtype = self.get_raw_ptr()[0].type() - return cudf_to_np_types[(cdtype.id())] + return self._dtype @property def value(self): @@ -137,13 +145,27 @@ cdef class DeviceScalar: return f"{self.__class__.__name__}({self.value.__repr__()})" @staticmethod - cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr): + cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr, dtype=None): """ Construct a Scalar object from a unique_ptr. 
""" cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) + cdef libcudf_types.data_type cdtype + s.c_value = move(ptr) + cdtype = s.get_raw_ptr()[0].type() + if cdtype.id() == libcudf_types.DECIMAL64 and dtype is None: + raise TypeError( + "Must pass a dtype when constructing from a fixed-point scalar" + ) + else: + if dtype is not None: + s._dtype = dtype + else: + s._dtype = cudf_to_np_types[ + (cdtype.id()) + ] return s @@ -235,6 +257,17 @@ cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, else: raise ValueError(f"dtype not supported: {dtype}") +cdef _set_decimal64_from_scalar(unique_ptr[scalar]& s, + object value, + object dtype, + bool valid=True): + value = _decimal_to_int64(value) if valid else 0 + s.reset( + new fixed_point_scalar[decimal64]( + np.int64(value), scale_type(-dtype.scale), valid + ) + ) + cdef _get_py_string_from_string(unique_ptr[scalar]& s): if not s.get()[0].is_valid(): return cudf.NA @@ -274,6 +307,20 @@ cdef _get_np_scalar_from_numeric(unique_ptr[scalar]& s): raise ValueError("Could not convert cudf::scalar to numpy scalar") +cdef _get_py_decimal_from_fixed_point(unique_ptr[scalar]& s): + cdef scalar* s_ptr = s.get() + if not s_ptr[0].is_valid(): + return cudf.NA + + cdef libcudf_types.data_type cdtype = s_ptr[0].type() + + if cdtype.id() == libcudf_types.DECIMAL64: + rep_val = int((s_ptr)[0].value()) + scale = int((s_ptr)[0].type().scale()) + return decimal.Decimal(rep_val).scaleb(scale) + else: + raise ValueError("Could not convert cudf::scalar to numpy scalar") + cdef _get_np_scalar_from_timestamp64(unique_ptr[scalar]& s): cdef scalar* s_ptr = s.get() diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 96e09a5abb5..971d849d970 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -17,6 +17,8 @@ from_decimal as cpp_from_decimal, ) from cudf.core.column import as_column +from decimal import Decimal +from cudf.utils.dtypes import is_scalar class DecimalColumn(ColumnBase): @@ -71,6 +73,12 @@ def binary_operator(self, op, other, reflect=False): result.dtype.precision = _binop_precision(self.dtype, other.dtype, op) return result + def normalize_binop_value(self, other): + if is_scalar(other) and isinstance(other, (int, np.int, Decimal)): + return cudf.Scalar(Decimal(other)) + else: + raise TypeError(f"cannot normalize {type(other)}") + def _apply_scan_op(self, op: str) -> ColumnBase: return libcudf.reduce.scan(op, self, True) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index a18aad3872b..0bde2eb551f 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -311,6 +311,15 @@ def _validate(cls, precision, scale=0): if abs(scale) > precision: raise ValueError(f"scale={scale} exceeds precision={precision}") + @classmethod + def _from_decimal(cls, decimal): + """ + Create a cudf.Decimal64Dtype from a decimal.Decimal object + """ + metadata = decimal.as_tuple() + precision = max(len(metadata.digits), -metadata.exponent) + return cls(precision, -metadata.exponent) + class IntervalDtype(StructDtype): name = "interval" diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 1e998ae37e2..5514e655211 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -1,9 +1,11 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. 
+import decimal import numpy as np from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar from cudf.core.column.column import ColumnBase +from cudf.core.dtypes import Decimal64Dtype from cudf.core.index import Index from cudf.core.series import Series from cudf.utils.dtypes import ( @@ -112,29 +114,44 @@ def _device_value_to_host(self): self._host_value = self._device_value._to_host_scalar() def _preprocess_host_value(self, value, dtype): + if isinstance(dtype, Decimal64Dtype): + # TODO: Support coercion from decimal.Decimal to different dtype + # TODO: Support coercion from integer to Decimal64Dtype + raise NotImplementedError( + "dtype as cudf.Decimal64Dtype is not supported. Pass a " + "decimal.Decimal to construct a DecimalScalar." + ) + if isinstance(value, decimal.Decimal) and dtype is not None: + raise TypeError(f"Can not coerce decimal to {dtype}") + value = to_cudf_compatible_scalar(value, dtype=dtype) valid = not _is_null_host_scalar(value) - if dtype is None: - if not valid: - if isinstance(value, (np.datetime64, np.timedelta64)): - unit, _ = np.datetime_data(value) - if unit == "generic": + if isinstance(value, decimal.Decimal): + # 0.0042 -> Decimal64Dtype(2, 4) + dtype = Decimal64Dtype._from_decimal(value) + + else: + if dtype is None: + if not valid: + if isinstance(value, (np.datetime64, np.timedelta64)): + unit, _ = np.datetime_data(value) + if unit == "generic": + raise TypeError( + "Cant convert generic NaT to null scalar" + ) + else: + dtype = value.dtype + else: raise TypeError( - "Cant convert generic NaT to null scalar" + "dtype required when constructing a null scalar" ) - else: - dtype = value.dtype else: - raise TypeError( - "dtype required when constructing a null scalar" - ) - else: - dtype = value.dtype - dtype = np.dtype(dtype) + dtype = value.dtype + dtype = np.dtype(dtype) - # temporary - dtype = np.dtype("object") if dtype.char == "U" else dtype + # temporary + dtype = np.dtype("object") if dtype.char == "U" else dtype if not valid: value = NA diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index eb8aaaadd51..2e2992fc524 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1773,6 +1773,193 @@ def decimal_series(input, dtype): utils.assert_eq(expect, got) +@pytest.mark.parametrize( + "args", + [ + ( + operator.add, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + decimal.Decimal(1), + ["101", "201"], + cudf.Decimal64Dtype(scale=0, precision=6), + False, + ), + ( + operator.add, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + 1, + ["101", "201"], + cudf.Decimal64Dtype(scale=0, precision=6), + False, + ), + ( + operator.add, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + decimal.Decimal("1.5"), + ["101.5", "201.5"], + cudf.Decimal64Dtype(scale=1, precision=7), + False, + ), + ( + operator.add, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + decimal.Decimal(1), + ["101", "201"], + cudf.Decimal64Dtype(scale=0, precision=6), + True, + ), + ( + operator.add, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + 1, + ["101", "201"], + cudf.Decimal64Dtype(scale=0, precision=6), + True, + ), + ( + operator.add, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + decimal.Decimal("1.5"), + ["101.5", "201.5"], + cudf.Decimal64Dtype(scale=1, precision=7), + True, + ), + ( + operator.mul, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + 1, + ["100", "200"], + 
cudf.Decimal64Dtype(scale=-2, precision=5), + False, + ), + ( + operator.mul, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + decimal.Decimal(2), + ["200", "400"], + cudf.Decimal64Dtype(scale=-2, precision=5), + False, + ), + ( + operator.mul, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + decimal.Decimal("1.5"), + ["150", "300"], + cudf.Decimal64Dtype(scale=-1, precision=6), + False, + ), + ( + operator.mul, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + 1, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=5), + True, + ), + ( + operator.mul, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + decimal.Decimal(2), + ["200", "400"], + cudf.Decimal64Dtype(scale=-2, precision=5), + True, + ), + ( + operator.mul, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + decimal.Decimal("1.5"), + ["150", "300"], + cudf.Decimal64Dtype(scale=-1, precision=6), + True, + ), + ( + operator.sub, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + decimal.Decimal(2), + ["98", "198"], + cudf.Decimal64Dtype(scale=0, precision=6), + False, + ), + ( + operator.sub, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + decimal.Decimal("2.5"), + ["97.5", "197.5"], + cudf.Decimal64Dtype(scale=1, precision=7), + False, + ), + ( + operator.sub, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + 4, + ["96", "196"], + cudf.Decimal64Dtype(scale=0, precision=6), + False, + ), + ( + operator.sub, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + decimal.Decimal(2), + ["-98", "-198"], + cudf.Decimal64Dtype(scale=0, precision=6), + True, + ), + ( + operator.sub, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + 4, + ["-96", "-196"], + cudf.Decimal64Dtype(scale=0, precision=6), + True, + ), + ( + operator.sub, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + decimal.Decimal("2.5"), + ["-97.5", "-197.5"], + cudf.Decimal64Dtype(scale=1, precision=7), + True, + ), + ], +) +def test_binops_decimal_scalar(args): + op, lhs, l_dtype, rhs, expect, expect_dtype, reflect = args + + def decimal_series(input, dtype): + return cudf.Series( + [x if x is None else decimal.Decimal(x) for x in input], + dtype=dtype, + ) + + lhs = decimal_series(lhs, l_dtype) + expect = decimal_series(expect, expect_dtype) + + if reflect: + lhs, rhs = rhs, lhs + + got = op(lhs, rhs) + assert expect.dtype == got.dtype + utils.assert_eq(expect, got) + + @pytest.mark.parametrize( "dtype", [ diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index 58115cecee7..916e73ea381 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -1,6 +1,7 @@ import datetime import datetime as dt import re +from decimal import Decimal import numpy as np import pandas as pd @@ -16,6 +17,12 @@ TIMEDELTA_TYPES, ) +TEST_DECIMAL_TYPES = [ + cudf.Decimal64Dtype(1, 1), + cudf.Decimal64Dtype(4, 2), + cudf.Decimal64Dtype(4, -2), +] + SCALAR_VALUES = [ 0, -1, @@ -103,8 +110,14 @@ np.object_("asdf"), ] +DECIMAL_VALUES = [ + Decimal("100"), + Decimal("0.0042"), + Decimal("1.0042"), +] + -@pytest.mark.parametrize("value", SCALAR_VALUES) +@pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES) def test_scalar_host_initialization(value): s = cudf.Scalar(value) @@ -130,7 +143,24 @@ def test_scalar_device_initialization(value): assert s._is_host_value_current -@pytest.mark.parametrize("value", SCALAR_VALUES) 
+@pytest.mark.parametrize("value", DECIMAL_VALUES) +def test_scalar_device_initialization_decimal(value): + dtype = cudf.Decimal64Dtype._from_decimal(value) + column = cudf.Series([str(value)]).astype(dtype)._column + dev_slr = get_element(column, 0) + + s = cudf.Scalar(dev_slr) + + assert s._is_device_value_current + assert not s._is_host_value_current + + assert s.value == value + + assert s._is_device_value_current + assert s._is_host_value_current + + +@pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES) def test_scalar_roundtrip(value): s = cudf.Scalar(value) @@ -156,9 +186,19 @@ def test_scalar_roundtrip(value): @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"] + "dtype", + NUMERIC_TYPES + + DATETIME_TYPES + + TIMEDELTA_TYPES + + ["object"] + + TEST_DECIMAL_TYPES, ) def test_null_scalar(dtype): + if isinstance(dtype, cudf.Decimal64Dtype): + with pytest.raises(NotImplementedError): + s = cudf.Scalar(None, dtype=dtype) + return + s = cudf.Scalar(None, dtype=dtype) assert s.value is cudf.NA assert s.dtype == np.dtype(dtype) @@ -194,9 +234,19 @@ def test_generic_null_scalar_construction_fails(value): @pytest.mark.parametrize( - "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"] + "dtype", + NUMERIC_TYPES + + DATETIME_TYPES + + TIMEDELTA_TYPES + + ["object"] + + TEST_DECIMAL_TYPES, ) def test_scalar_dtype_and_validity(dtype): + if isinstance(dtype, cudf.Decimal64Dtype): + with pytest.raises(NotImplementedError): + s = cudf.Scalar(None, dtype=dtype) + return + s = cudf.Scalar(1, dtype=dtype) assert s.dtype == np.dtype(dtype) @@ -277,24 +327,33 @@ def test_scalar_invalid_implicit_conversion(cls, dtype): cls(slr) -@pytest.mark.parametrize("value", SCALAR_VALUES) +@pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES) def test_device_scalar_direct_construction(value): value = cudf.utils.utils.to_cudf_compatible_scalar(value) - dtype = value.dtype + dtype = ( + value.dtype + if not isinstance(value, Decimal) + else cudf.Decimal64Dtype._from_decimal(value) + ) s = cudf._lib.scalar.DeviceScalar(value, dtype) assert s.value == value or np.isnan(s.value) and np.isnan(value) - if dtype.char == "U": + if isinstance(dtype, cudf.Decimal64Dtype): + assert s.dtype.precision == dtype.precision + assert s.dtype.scale == dtype.scale + elif dtype.char == "U": assert s.dtype == "object" else: assert s.dtype == dtype -@pytest.mark.parametrize("value", SCALAR_VALUES) +@pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES) def test_construct_from_scalar(value): value = cudf.utils.utils.to_cudf_compatible_scalar(value) - x = cudf.Scalar(value, value.dtype) + x = cudf.Scalar( + value, value.dtype if not isinstance(value, Decimal) else None + ) y = cudf.Scalar(x) assert x.value == y.value or np.isnan(x.value) and np.isnan(y.value) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index be2b1bca2e0..61cae792bb5 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -4,6 +4,7 @@ import numbers from collections import namedtuple from collections.abc import Sequence +from decimal import Decimal import cupy as cp import numpy as np @@ -348,6 +349,9 @@ def to_cudf_compatible_scalar(val, dtype=None): "to cudf scalar" ) + if isinstance(val, Decimal): + return val + if isinstance(val, (np.ndarray, cp.ndarray)) and val.ndim == 0: val = val.item() @@ -578,6 +582,24 @@ def _get_nan_for_dtype(dtype): return np.float64("nan") +def _decimal_to_int64(decimal: 
Decimal) -> int: + """ + Scale a Decimal such that the result is the integer + that would result from removing the decimal point. + + Examples + -------- + >>> _decimal_to_int64(Decimal('1.42')) + 142 + >>> _decimal_to_int64(Decimal('0.0042')) + 42 + >>> _decimal_to_int64(Decimal('-1.004201')) + -1004201 + + """ + return int(f"{decimal:0f}".replace(".", "")) + + def get_allowed_combinations_for_operator(dtype_l, dtype_r, op): error = TypeError( f"{op} not supported between {dtype_l} and {dtype_r} scalars" From 24f301605fe19c66c336e93d71565169b6fdcab1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 31 Mar 2021 17:36:50 -0700 Subject: [PATCH 14/59] Support groupby operations for decimal dtypes (#7731) This PR resolves #7687. It also does a bit of cleanup of the internals of the code base. There is more that I would like to do, but in the interest of quickly resolving the issue I'll probably punt to a future PR everything that I don't directly have to touch for this change. I still need help determining why a few aggregations aren't working. The problems fall into two groups: 1. The `var` and `std` aggregations currently don't fail, but they always return columns filled with NULLs. I found the implementation of the dispatch for these methods in `variance.cu`/`compound.cuh`, and at least nominally it seems like these methods _are not_ currently supported because the corresponding `enable_if_t` is based on whether the type satisfies `std::is_arithmetic`, which decimal types do not. However, I'm not sure whether the problem is that this classification is incorrect and these types are actually supported by `libcudf`, or whether there really isn't an implementation; I tried to find one, but there are a lot of different files related to aggregation and I'm sure I didn't find all of them. If we simply don't have an implementation, I can remove these from the list of valid aggregations. 2. The `mean`, `quantile`, and `median` aggregations all raise a `RuntimeError` from `binaryop.hpp`: "Input must have fixed_point data_type." I've traced the error to the Cython `GroupBy.aggregate` method, specifically the line where it calls through to the underlying `c_obj`'s `aggregate` method. The call stack in C++ is pretty deep after that, though, and I haven't yet been able to pinpoint whether the failure is a missing cast somewhere (i.e. `libcudf` thinks that the column is a floating point type when it's really not) or whether the problem lies elsewhere. **Update** Thanks to @codereport, I've now marked all the above as unsupported operations. After some discussion with other devs I've also handled the other extended types. I still need to write tests, but I think this PR is ready for review in its current form to identify whether I've missed anything in the implementation.
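Example (added here for illustration only, not part of the PR itself; the frame contents are made up, the construction mirrors the PR's own tests, and decimal aggregations require CUDA >= 11 per the note in the diff below):

```python
>>> import cudf
>>> from decimal import Decimal
>>> vals = cudf.Series(
...     [Decimal("1.10"), Decimal("2.25"), Decimal("3.50")],
...     dtype=cudf.Decimal64Dtype(precision=3, scale=2),
... )
>>> df = cudf.DataFrame({"key": [0, 0, 1], "val": vals})
>>> df.groupby("key").agg("sum")   # supported: sum is in _DECIMAL_AGGS
>>> df.groupby("key").agg("mean")  # dropped as unsupported, so this raises
```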
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) - Keith Kraus (https://github.com/kkraus14) URL: https://github.com/rapidsai/cudf/pull/7731 --- docs/cudf/source/groupby.md | 36 ++++---- python/cudf/cudf/_lib/groupby.pyx | 69 +++++++++++----- python/cudf/cudf/core/column/column.py | 14 +++- python/cudf/cudf/core/groupby/groupby.py | 82 +++++++++++------- python/cudf/cudf/tests/test_groupby.py | 101 ++++++++++++++++------- python/cudf/cudf/utils/dtypes.py | 8 +- 6 files changed, 208 insertions(+), 102 deletions(-) diff --git a/docs/cudf/source/groupby.md b/docs/cudf/source/groupby.md index 5376df261e7..8a0e5dddba0 100644 --- a/docs/cudf/source/groupby.md +++ b/docs/cudf/source/groupby.md @@ -120,24 +120,24 @@ a The following table summarizes the available aggregations and the types that support them: -| Aggregations\dtypes | Numeric | Datetime | String | Categorical | List | Struct | -| ------------------- | -------- | ------- | -------- | ----------- | ---- | ------ | -| count | ✅ | ✅ | ✅ | ✅ | | | -| size | ✅ | ✅ | ✅ | ✅ | | | -| sum | ✅ | ✅ | | | | | -| idxmin | ✅ | ✅ | | | | | -| idxmax | ✅ | ✅ | | | | | -| min | ✅ | ✅ | ✅ | | | | -| max | ✅ | ✅ | ✅ | | | | -| mean | ✅ | ✅ | | | | | -| var | ✅ | ✅ | | | | | -| std | ✅ | ✅ | | | | | -| quantile | ✅ | ✅ | | | | | -| median | ✅ | ✅ | | | | | -| nunique | ✅ | ✅ | ✅ | ✅ | | | -| nth | ✅ | ✅ | ✅ | | | | -| collect | ✅ | ✅ | ✅ | | ✅ | | -| unique | ✅ | ✅ | ✅ | ✅ | | | +| Aggregations\dtypes | Numeric | Datetime | String | Categorical | List | Struct | Interval | Decimal | +| ------------------- | -------- | ------- | -------- | ----------- | ---- | ------ | -------- | ------- | +| count | ✅ | ✅ | ✅ | ✅ | | | | ✅ | +| size | ✅ | ✅ | ✅ | ✅ | | | | ✅ | +| sum | ✅ | ✅ | | | | | | ✅ | +| idxmin | ✅ | ✅ | | | | | | ✅ | +| idxmax | ✅ | ✅ | | | | | | ✅ | +| min | ✅ | ✅ | ✅ | | | | | ✅ | +| max | ✅ | ✅ | ✅ | | | | | ✅ | +| mean | ✅ | ✅ | | | | | | | +| var | ✅ | ✅ | | | | | | | +| std | ✅ | ✅ | | | | | | | +| quantile | ✅ | ✅ | | | | | | | +| median | ✅ | ✅ | | | | | | | +| nunique | ✅ | ✅ | ✅ | ✅ | | | | ✅ | +| nth | ✅ | ✅ | ✅ | | | | | ✅ | +| collect | ✅ | ✅ | ✅ | | ✅ | | | ✅ | +| unique | ✅ | ✅ | ✅ | ✅ | | | | | ## GroupBy apply diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 713a2274a77..4584841dd33 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -3,6 +3,7 @@ from collections import defaultdict import numpy as np +import rmm from libcpp.pair cimport pair from libcpp.memory cimport unique_ptr @@ -20,25 +21,9 @@ cimport cudf._lib.cpp.groupby as libcudf_groupby cimport cudf._lib.cpp.aggregation as libcudf_aggregation -_GROUPBY_AGGS = { - "count", - "size", - "sum", - "idxmin", - "idxmax", - "min", - "max", - "mean", - "var", - "std", - "quantile", - "median", - "nunique", - "nth", - "collect", - "unique", -} - +# The sets below define the possible aggregations that can be performed on +# different dtypes. The uppercased versions of these strings correspond to +# elements of the AggregationKind enum. 
_CATEGORICAL_AGGS = { "count", "size", @@ -61,6 +46,24 @@ _LIST_AGGS = { "collect", } +_STRUCT_AGGS = { +} + +_INTERVAL_AGGS = { +} + +_DECIMAL_AGGS = { + "count", + "sum", + "argmin", + "argmax", + "min", + "max", + "nunique", + "nth", + "collect" +} + cdef class GroupBy: cdef unique_ptr[libcudf_groupby.groupby] c_obj @@ -197,7 +200,10 @@ def _drop_unsupported_aggs(Table values, aggs): from cudf.utils.dtypes import ( is_categorical_dtype, is_string_dtype, - is_list_dtype + is_list_dtype, + is_interval_dtype, + is_struct_dtype, + is_decimal_dtype, ) result = aggs.copy() @@ -220,6 +226,29 @@ def _drop_unsupported_aggs(Table values, aggs): for i, agg_name in enumerate(aggs[col_name]): if Aggregation(agg_name).kind not in _CATEGORICAL_AGGS: del result[col_name][i] + elif ( + is_struct_dtype(values._data[col_name].dtype) + ): + for i, agg_name in enumerate(aggs[col_name]): + if Aggregation(agg_name).kind not in _STRUCT_AGGS: + del result[col_name][i] + elif ( + is_interval_dtype(values._data[col_name].dtype) + ): + for i, agg_name in enumerate(aggs[col_name]): + if Aggregation(agg_name).kind not in _INTERVAL_AGGS: + del result[col_name][i] + elif ( + is_decimal_dtype(values._data[col_name].dtype) + ): + if rmm._cuda.gpu.runtimeGetVersion() < 11000: + raise RuntimeError( + "Decimal aggregations are only supported on CUDA >= 11 " + "due to an nvcc compiler bug." + ) + for i, agg_name in enumerate(aggs[col_name]): + if Aggregation(agg_name).kind not in _DECIMAL_AGGS: + del result[col_name][i] if all(len(v) == 0 for v in result.values()): raise DataError("No numeric types to aggregate") diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e59b395ec0f..8531ec18edc 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -426,6 +426,8 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase: array.type, pd.core.arrays._arrow_utils.ArrowIntervalType ): return cudf.core.column.IntervalColumn.from_arrow(array) + elif isinstance(array.type, pa.Decimal128Type): + return cudf.core.column.DecimalColumn.from_arrow(array) return libcudf.interop.from_arrow(data, data.column_names)._data[ "None" @@ -1846,10 +1848,14 @@ def as_column( cupy.asarray(arbitrary), nan_as_null=nan_as_null, dtype=dtype ) else: - data = as_column( - pa.array(arbitrary, from_pandas=nan_as_null), - dtype=arbitrary.dtype, - ) + pyarrow_array = pa.array(arbitrary, from_pandas=nan_as_null) + if isinstance(pyarrow_array.type, pa.Decimal128Type): + pyarrow_type = cudf.Decimal64Dtype.from_arrow( + pyarrow_array.type + ) + else: + pyarrow_type = arbitrary.dtype + data = as_column(pyarrow_array, dtype=pyarrow_type) if dtype is not None: data = data.astype(dtype) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 86e1f5cfe30..cc94548d9a2 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -13,6 +13,8 @@ from cudf.utils.utils import cached_property +# Note that all valid aggregation methods (e.g. GroupBy.min) are bound to the +# class after its definition (see below). 
class GroupBy(Serializable): _MAX_GROUPS_BEFORE_WARN = 100 @@ -58,14 +60,6 @@ def __init__( else: self.grouping = _Grouping(obj, by, level) - def __getattribute__(self, key): - try: - return super().__getattribute__(key) - except AttributeError: - if key in libgroupby._GROUPBY_AGGS: - return functools.partial(self._agg_func_name_with_args, key) - raise - def __iter__(self): group_names, offsets, _, grouped_values = self._grouped() if isinstance(group_names, cudf.Index): @@ -267,19 +261,6 @@ def _grouped(self): group_names = grouped_keys.unique() return (group_names, offsets, grouped_keys, grouped_values) - def _agg_func_name_with_args(self, func_name, *args, **kwargs): - """ - Aggregate given an aggregate function name - and arguments to the function, e.g., - `_agg_func_name_with_args("quantile", 0.5)` - """ - - def func(x): - return getattr(x, func_name)(*args, **kwargs) - - func.__name__ = func_name - return self.agg(func) - def _normalize_aggs(self, aggs): """ Normalize aggs to a dict mapping column names @@ -590,6 +571,48 @@ def rolling(self, *args, **kwargs): return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs) +# Set of valid groupby aggregations that are monkey-patched into the GroupBy +# namespace. +_VALID_GROUPBY_AGGS = { + "count", + "sum", + "idxmin", + "idxmax", + "min", + "max", + "mean", + "var", + "std", + "quantile", + "median", + "nunique", + "collect", + "unique", +} + + +# Dynamically bind the different aggregation methods. +def _agg_func_name_with_args(self, func_name, *args, **kwargs): + """ + Aggregate given an aggregate function name and arguments to the + function, e.g., `_agg_func_name_with_args("quantile", 0.5)`. The named + aggregations must be members of _AggregationFactory. + """ + + def func(x): + """Compute the {} of the group.""".format(func_name) + return getattr(x, func_name)(*args, **kwargs) + + func.__name__ = func_name + return self.agg(func) + + +for key in _VALID_GROUPBY_AGGS: + setattr( + GroupBy, key, functools.partialmethod(_agg_func_name_with_args, key) + ) + + class DataFrameGroupBy(GroupBy): def __init__( self, obj, by=None, level=None, sort=False, as_index=True, dropna=True @@ -685,15 +708,16 @@ def __init__( dropna=dropna, ) - def __getattribute__(self, key): + def __getattr__(self, key): + # Without this check, copying can trigger a RecursionError. See + # https://nedbatchelder.com/blog/201010/surprising_getattr_recursion.html # noqa: E501 + # for an explanation. 
+ if key == "obj": + raise AttributeError try: - return super().__getattribute__(key) - except AttributeError: - if key in self.obj: - return self.obj[key].groupby( - self.grouping, dropna=self._dropna, sort=self._sort - ) - raise + return self[key] + except KeyError: + raise AttributeError def __getitem__(self, key): return self.obj[key].groupby( diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index a96db59dee3..84b52b1befb 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -8,6 +8,7 @@ import pytest from numba import cuda from numpy.testing import assert_array_equal +from decimal import Decimal import cudf from cudf.core import DataFrame, Series @@ -20,6 +21,8 @@ assert_exceptions_equal, ) +import rmm + _now = np.datetime64("now") _tomorrow = _now + np.timedelta64(1, "D") _now = np.int64(_now.astype("datetime64[ns]")) @@ -148,26 +151,6 @@ def test_groupby_agg_min_max_dictlist(nelem): assert_eq(got_df, expect_df) -@pytest.mark.parametrize("nelem", [2, 3, 100, 1000]) -@pytest.mark.parametrize( - "func", ["mean", "min", "max", "idxmin", "idxmax", "count", "sum"] -) -def test_groupby_2keys_agg(nelem, func): - # gdf (Note: lack of multiIndex) - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .agg(func) - ) - got_df = ( - make_frame(DataFrame, nelem=nelem) - .groupby(["x", "y"], sort=True) - .agg(func) - ) - check_dtype = False if func in _index_type_aggs else True - assert_eq(got_df, expect_df, check_dtype=check_dtype) - - @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_as_index_single_agg(pdf, gdf, as_index): gdf = gdf.groupby("y", as_index=as_index, sort=True).agg({"x": "mean"}) @@ -331,28 +314,90 @@ def emulate(df): assert_eq(expect, got) -@pytest.mark.parametrize("nelem", [100, 500]) +@pytest.mark.parametrize("nelem", [2, 3, 100, 500, 1000]) @pytest.mark.parametrize( "func", ["mean", "std", "var", "min", "max", "idxmin", "idxmax", "count", "sum"], ) -def test_groupby_cudf_2keys_agg(nelem, func): - got_df = ( - make_frame(DataFrame, nelem=nelem) +def test_groupby_2keys_agg(nelem, func): + # gdf (Note: lack of multiIndex) + expect_df = ( + make_frame(pd.DataFrame, nelem=nelem) .groupby(["x", "y"], sort=True) .agg(func) ) - - # pandas - expect_df = ( - make_frame(pd.DataFrame, nelem=nelem) + got_df = ( + make_frame(DataFrame, nelem=nelem) .groupby(["x", "y"], sort=True) .agg(func) ) + check_dtype = False if func in _index_type_aggs else True assert_eq(got_df, expect_df, check_dtype=check_dtype) +@pytest.mark.parametrize("num_groups", [2, 3, 10, 50, 100]) +@pytest.mark.parametrize("nelem_per_group", [1, 10, 100]) +@pytest.mark.parametrize( + "func", + ["min", "max", "count", "sum"], + # TODO: Replace the above line with the one below once + # https://github.com/pandas-dev/pandas/issues/40685 is resolved. + # "func", ["min", "max", "idxmin", "idxmax", "count", "sum"], +) +def test_groupby_agg_decimal(num_groups, nelem_per_group, func): + # The number of digits after the decimal to use. + decimal_digits = 2 + # The number of digits before the decimal to use. + whole_digits = 2 + + scale = 10 ** whole_digits + nelem = num_groups * nelem_per_group + + # The unique is necessary because otherwise if there are duplicates idxmin + # and idxmax may return different results than pandas (see + # https://github.com/rapidsai/cudf/issues/7756). 
This is not relevant to + # the current version of the test, because idxmin and idxmax simply don't + # work with pandas Series composed of Decimal objects (see + # https://github.com/pandas-dev/pandas/issues/40685). However, if that is + # ever enabled, then this issue will crop up again so we may as well have + # it fixed now. + x = np.unique((np.random.rand(nelem) * scale).round(decimal_digits)) + y = np.unique((np.random.rand(nelem) * scale).round(decimal_digits)) + + if x.size < y.size: + total_elements = x.size + y = y[: x.size] + else: + total_elements = y.size + x = x[: y.size] + + # Note that this filtering can lead to one group with fewer elements, but + # that shouldn't be a problem and is probably useful to test. + idx_col = np.tile(np.arange(num_groups), nelem_per_group)[:total_elements] + + decimal_x = pd.Series([Decimal(str(d)) for d in x]) + decimal_y = pd.Series([Decimal(str(d)) for d in y]) + + pdf = pd.DataFrame({"idx": idx_col, "x": decimal_x, "y": decimal_y}) + gdf = DataFrame( + { + "idx": idx_col, + "x": cudf.Series(decimal_x), + "y": cudf.Series(decimal_y), + } + ) + + expect_df = pdf.groupby("idx", sort=True).agg(func) + if rmm._cuda.gpu.runtimeGetVersion() < 11000: + with pytest.raises(RuntimeError): + got_df = gdf.groupby("idx", sort=True).agg(func) + else: + got_df = gdf.groupby("idx", sort=True).agg(func) + assert_eq(expect_df["x"], got_df["x"], check_dtype=False) + assert_eq(expect_df["y"], got_df["y"], check_dtype=False) + + @pytest.mark.parametrize( "agg", ["min", "max", "idxmin", "idxmax", "count", "sum", "mean"] ) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 61cae792bb5..5cb0391d76f 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -295,9 +295,11 @@ def cudf_dtype_to_pa_type(dtype): """ if is_categorical_dtype(dtype): raise NotImplementedError() - elif is_list_dtype(dtype): - return dtype.to_arrow() - elif is_struct_dtype(dtype): + elif ( + is_list_dtype(dtype) + or is_struct_dtype(dtype) + or is_decimal_dtype(dtype) + ): return dtype.to_arrow() else: return np_to_pa_dtype(np.dtype(dtype)) From 5f1dc795a74778cad247ef00ffd40b2ab7a7e30e Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Wed, 31 Mar 2021 18:30:11 -0700 Subject: [PATCH 15/59] Add decimal column comparison operations (#7716) Closes #7498 This PR adds binary comparison `eq`, `lt`, `gt`, `le`, `ge` to - [x] decimal column v. decimal column - [x] decimal column v. decimal scalar (`decimal.Decimal` and decimal `cudf.Scalar`) - [x] decimal column v. integer column (`cudf.utils.dtypes.INTEGER_TYPES`) - [x] decimal column v. 
integer scalar (Python ints) Other minor adds: - Supports binary ops between `cudf.DecimalColumn` and `cudf.Scalar`, where `Scalar.dtype` is `cudf.Decimal64Dtype` (follow up for #7732 ) - Short comment noting use of decimal64 in `decimals.pxd` - Adding decimal data type in `basics.rst` Authors: - Michael Wang (https://github.com/isVoid) Approvers: - Keith Kraus (https://github.com/kkraus14) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/7716 --- docs/cudf/source/basics.rst | 2 + .../cudf/cudf/_lib/cpp/wrappers/decimals.pxd | 1 + python/cudf/cudf/core/column/decimal.py | 62 ++- python/cudf/cudf/core/column/numerical.py | 15 +- python/cudf/cudf/tests/test_binops.py | 514 +++++++++++++++++- 5 files changed, 568 insertions(+), 26 deletions(-) diff --git a/docs/cudf/source/basics.rst b/docs/cudf/source/basics.rst index e270708df90..15b4b43662b 100644 --- a/docs/cudf/source/basics.rst +++ b/docs/cudf/source/basics.rst @@ -34,6 +34,8 @@ The following table lists all of cudf types. For methods requiring dtype argumen +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ | Boolean | | np.bool_ | ``'bool'`` | +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ +| Decimal | Decimal64Dtype | (none) | (none) | ++------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ **Note: All dtypes above are Nullable** diff --git a/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd b/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd index a73e6e0151d..9de23fb2595 100644 --- a/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd +++ b/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport int64_t, int32_t cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil: + # cython type stub to help resolve to numeric::decimal64 ctypedef int64_t decimal64 cdef cppclass scale_type: diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 971d849d970..e93c5824817 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,24 +1,24 @@ # Copyright (c) 2021, NVIDIA CORPORATION. 
-import cudf +from decimal import Decimal +from typing import cast + import cupy as cp import numpy as np import pyarrow as pa -from typing import cast +from pandas.api.types import is_integer_dtype +import cudf from cudf import _lib as libcudf -from cudf.core.buffer import Buffer -from cudf.core.column import ColumnBase -from cudf.core.dtypes import Decimal64Dtype -from cudf.utils.utils import pa_mask_buffer_to_mask - -from cudf._typing import Dtype from cudf._lib.strings.convert.convert_fixed_point import ( from_decimal as cpp_from_decimal, ) -from cudf.core.column import as_column -from decimal import Decimal +from cudf._typing import Dtype +from cudf.core.buffer import Buffer +from cudf.core.column import ColumnBase, as_column +from cudf.core.dtypes import Decimal64Dtype from cudf.utils.dtypes import is_scalar +from cudf.utils.utils import pa_mask_buffer_to_mask class DecimalColumn(ColumnBase): @@ -65,17 +65,47 @@ def to_arrow(self): def binary_operator(self, op, other, reflect=False): if reflect: self, other = other, self - scale = _binop_scale(self.dtype, other.dtype, op) - output_type = Decimal64Dtype( - scale=scale, precision=Decimal64Dtype.MAX_PRECISION - ) # precision will be ignored, libcudf has no notion of precision - result = libcudf.binaryop.binaryop(self, other, op, output_type) - result.dtype.precision = _binop_precision(self.dtype, other.dtype, op) + + # Binary arithmetic between decimal columns. `Scale` and `precision` + # are computed outside of libcudf + if op in ("add", "sub", "mul"): + scale = _binop_scale(self.dtype, other.dtype, op) + output_type = Decimal64Dtype( + scale=scale, precision=Decimal64Dtype.MAX_PRECISION + ) # precision will be ignored, libcudf has no notion of precision + result = libcudf.binaryop.binaryop(self, other, op, output_type) + result.dtype.precision = _binop_precision( + self.dtype, other.dtype, op + ) + elif op in ("eq", "lt", "gt", "le", "ge"): + if not isinstance( + other, + (DecimalColumn, cudf.core.column.NumericalColumn, cudf.Scalar), + ): + raise TypeError( + f"Operator {op} not supported between " + f"{str(type(self))} and {str(type(other))}" + ) + if isinstance( + other, cudf.core.column.NumericalColumn + ) and not is_integer_dtype(other.dtype): + raise TypeError( + f"Only decimal and integer columns are supported for {op}."
+ ) + if isinstance(other, cudf.core.column.NumericalColumn): + other = other.as_decimal_column( + Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0) + ) + result = libcudf.binaryop.binaryop(self, other, op, bool) return result def normalize_binop_value(self, other): if is_scalar(other) and isinstance(other, (int, np.int, Decimal)): return cudf.Scalar(Decimal(other)) + elif isinstance(other, cudf.Scalar) and isinstance( + other.dtype, cudf.Decimal64Dtype + ): + return other else: raise TypeError(f"cannot normalize {type(other)}") diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index f58a47a918c..10a9ffbfbae 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -22,6 +22,7 @@ column, string, ) +from cudf.core.dtypes import Decimal64Dtype from cudf.utils import cudautils, utils from cudf.utils.dtypes import ( min_column_type, @@ -103,11 +104,23 @@ def binary_operator( out_dtype = self.dtype else: if not ( - isinstance(rhs, (NumericalColumn, cudf.Scalar,),) + isinstance( + rhs, + ( + NumericalColumn, + cudf.Scalar, + cudf.core.column.DecimalColumn, + ), + ) or np.isscalar(rhs) ): msg = "{!r} operator not supported between {} and {}" raise TypeError(msg.format(binop, type(self), type(rhs))) + if isinstance(rhs, cudf.core.column.DecimalColumn): + lhs = self.as_decimal_column( + Decimal64Dtype(Decimal64Dtype.MAX_PRECISION, 0) + ) + return lhs.binary_operator(binop, rhs) out_dtype = np.result_type(self.dtype, rhs.dtype) if binop in ["mod", "floordiv"]: tmp = self if reflect else rhs diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 2e2992fc524..ac80071c8e4 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -1615,6 +1615,12 @@ def test_binops_with_NA_consistent(dtype, op): assert result._column.null_count == len(data) +def _decimal_series(input, dtype): + return cudf.Series( + [x if x is None else decimal.Decimal(x) for x in input], dtype=dtype, + ) + + @pytest.mark.parametrize( "args", [ @@ -1753,26 +1759,311 @@ def test_binops_with_NA_consistent(dtype, op): ["10.0", None], cudf.Decimal64Dtype(scale=1, precision=8), ), + ( + operator.eq, + ["0.18", "0.42"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.18", "0.21"], + cudf.Decimal64Dtype(scale=2, precision=3), + [True, False], + bool, + ), + ( + operator.eq, + ["0.18", "0.42"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.1800", "0.2100"], + cudf.Decimal64Dtype(scale=4, precision=5), + [True, False], + bool, + ), + ( + operator.eq, + ["100", None], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["100", "200"], + cudf.Decimal64Dtype(scale=-1, precision=4), + [True, None], + bool, + ), + ( + operator.lt, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.10", "0.87", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + [False, True, False], + bool, + ), + ( + operator.lt, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.1000", "0.8700", "1.0000"], + cudf.Decimal64Dtype(scale=4, precision=5), + [False, True, False], + bool, + ), + ( + operator.lt, + ["200", None, "100"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["100", "200", "100"], + cudf.Decimal64Dtype(scale=-1, precision=4), + [False, None, False], + bool, + ), + ( + operator.gt, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.10", "0.87", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + 
[True, False, False], + bool, + ), + ( + operator.gt, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.1000", "0.8700", "1.0000"], + cudf.Decimal64Dtype(scale=4, precision=5), + [True, False, False], + bool, + ), + ( + operator.gt, + ["300", None, "100"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["100", "200", "100"], + cudf.Decimal64Dtype(scale=-1, precision=4), + [True, None, False], + bool, + ), + ( + operator.le, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.10", "0.87", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + [False, True, True], + bool, + ), + ( + operator.le, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.1000", "0.8700", "1.0000"], + cudf.Decimal64Dtype(scale=4, precision=5), + [False, True, True], + bool, + ), + ( + operator.le, + ["300", None, "100"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["100", "200", "100"], + cudf.Decimal64Dtype(scale=-1, precision=4), + [False, None, True], + bool, + ), + ( + operator.ge, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.10", "0.87", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + [True, False, True], + bool, + ), + ( + operator.ge, + ["0.18", "0.42", "1.00"], + cudf.Decimal64Dtype(scale=2, precision=3), + ["0.1000", "0.8700", "1.0000"], + cudf.Decimal64Dtype(scale=4, precision=5), + [True, False, True], + bool, + ), + ( + operator.ge, + ["300", None, "100"], + cudf.Decimal64Dtype(scale=-2, precision=3), + ["100", "200", "100"], + cudf.Decimal64Dtype(scale=-1, precision=4), + [True, None, True], + bool, + ), ], ) def test_binops_decimal(args): op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype = args - def decimal_series(input, dtype): - return cudf.Series( - [x if x is None else decimal.Decimal(x) for x in input], - dtype=dtype, - ) - - a = decimal_series(lhs, l_dtype) - b = decimal_series(rhs, r_dtype) - expect = decimal_series(expect, expect_dtype) + a = _decimal_series(lhs, l_dtype) + b = _decimal_series(rhs, r_dtype) + expect = ( + _decimal_series(expect, expect_dtype) + if isinstance(expect_dtype, cudf.Decimal64Dtype) + else cudf.Series(expect, dtype=expect_dtype) + ) got = op(a, b) assert expect.dtype == got.dtype utils.assert_eq(expect, got) +@pytest.mark.parametrize( + "args", + [ + ( + operator.eq, + ["100", "41", None], + cudf.Decimal64Dtype(scale=0, precision=5), + [100, 42, 12], + cudf.Series([True, False, None], dtype=bool), + cudf.Series([True, False, None], dtype=bool), + ), + ( + operator.eq, + ["100.000", "42.001", None], + cudf.Decimal64Dtype(scale=3, precision=6), + [100, 42, 12], + cudf.Series([True, False, None], dtype=bool), + cudf.Series([True, False, None], dtype=bool), + ), + ( + operator.eq, + ["100", "40", None], + cudf.Decimal64Dtype(scale=-1, precision=3), + [100, 42, 12], + cudf.Series([True, False, None], dtype=bool), + cudf.Series([True, False, None], dtype=bool), + ), + ( + operator.lt, + ["100", "40", "28", None], + cudf.Decimal64Dtype(scale=0, precision=3), + [100, 42, 24, 12], + cudf.Series([False, True, False, None], dtype=bool), + cudf.Series([False, False, True, None], dtype=bool), + ), + ( + operator.lt, + ["100.000", "42.002", "23.999", None], + cudf.Decimal64Dtype(scale=3, precision=6), + [100, 42, 24, 12], + cudf.Series([False, False, True, None], dtype=bool), + cudf.Series([False, True, False, None], dtype=bool), + ), + ( + operator.lt, + ["100", "40", "10", None], + cudf.Decimal64Dtype(scale=-1, precision=3), + [100, 42, 8, 12], + 
cudf.Series([False, True, False, None], dtype=bool), + cudf.Series([False, False, True, None], dtype=bool), + ), + ( + operator.gt, + ["100", "42", "20", None], + cudf.Decimal64Dtype(scale=0, precision=3), + [100, 40, 24, 12], + cudf.Series([False, True, False, None], dtype=bool), + cudf.Series([False, False, True, None], dtype=bool), + ), + ( + operator.gt, + ["100.000", "42.002", "23.999", None], + cudf.Decimal64Dtype(scale=3, precision=6), + [100, 42, 24, 12], + cudf.Series([False, True, False, None], dtype=bool), + cudf.Series([False, False, True, None], dtype=bool), + ), + ( + operator.gt, + ["100", "40", "10", None], + cudf.Decimal64Dtype(scale=-1, precision=3), + [100, 42, 8, 12], + cudf.Series([False, False, True, None], dtype=bool), + cudf.Series([False, True, False, None], dtype=bool), + ), + ( + operator.le, + ["100", "40", "28", None], + cudf.Decimal64Dtype(scale=0, precision=3), + [100, 42, 24, 12], + cudf.Series([True, True, False, None], dtype=bool), + cudf.Series([True, False, True, None], dtype=bool), + ), + ( + operator.le, + ["100.000", "42.002", "23.999", None], + cudf.Decimal64Dtype(scale=3, precision=6), + [100, 42, 24, 12], + cudf.Series([True, False, True, None], dtype=bool), + cudf.Series([True, True, False, None], dtype=bool), + ), + ( + operator.le, + ["100", "40", "10", None], + cudf.Decimal64Dtype(scale=-1, precision=3), + [100, 42, 8, 12], + cudf.Series([True, True, False, None], dtype=bool), + cudf.Series([True, False, True, None], dtype=bool), + ), + ( + operator.ge, + ["100", "42", "20", None], + cudf.Decimal64Dtype(scale=0, precision=3), + [100, 40, 24, 12], + cudf.Series([True, True, False, None], dtype=bool), + cudf.Series([True, False, True, None], dtype=bool), + ), + ( + operator.ge, + ["100.000", "42.002", "23.999", None], + cudf.Decimal64Dtype(scale=3, precision=6), + [100, 42, 24, 12], + cudf.Series([True, True, False, None], dtype=bool), + cudf.Series([True, False, True, None], dtype=bool), + ), + ( + operator.ge, + ["100", "40", "10", None], + cudf.Decimal64Dtype(scale=-1, precision=3), + [100, 42, 8, 12], + cudf.Series([True, False, True, None], dtype=bool), + cudf.Series([True, True, False, None], dtype=bool), + ), + ], +) +@pytest.mark.parametrize("integer_dtype", cudf.tests.utils.INTEGER_TYPES) +@pytest.mark.parametrize("reflected", [True, False]) +def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected): + """ + Tested compare operations: + eq, lt, gt, le, ge + Each operation has 3 decimal data setups, with scale from {==0, >0, <0}. + Decimal precisions are sufficient to hold the digits. + For each decimal data setup, there is at least one row that lead to one + of the following compare results: {True, False, None}. 
+ """ + if not reflected: + op, ldata, ldtype, rdata, expected, _ = args + else: + op, ldata, ldtype, rdata, _, expected = args + + lhs = _decimal_series(ldata, ldtype) + rhs = cudf.Series(rdata, dtype=integer_dtype) + + if reflected: + rhs, lhs = lhs, rhs + + actual = op(lhs, rhs) + + utils.assert_eq(expected, actual) + + @pytest.mark.parametrize( "args", [ @@ -1803,6 +2094,15 @@ def decimal_series(input, dtype): cudf.Decimal64Dtype(scale=1, precision=7), False, ), + ( + operator.add, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + cudf.Scalar(decimal.Decimal("1.5")), + ["101.5", "201.5"], + cudf.Decimal64Dtype(scale=1, precision=7), + False, + ), ( operator.add, ["100", "200"], @@ -1830,6 +2130,15 @@ def decimal_series(input, dtype): cudf.Decimal64Dtype(scale=1, precision=7), True, ), + ( + operator.add, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + cudf.Scalar(decimal.Decimal("1.5")), + ["101.5", "201.5"], + cudf.Decimal64Dtype(scale=1, precision=7), + True, + ), ( operator.mul, ["100", "200"], @@ -1857,6 +2166,15 @@ def decimal_series(input, dtype): cudf.Decimal64Dtype(scale=-1, precision=6), False, ), + ( + operator.mul, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + cudf.Scalar(decimal.Decimal("1.5")), + ["150", "300"], + cudf.Decimal64Dtype(scale=-1, precision=6), + False, + ), ( operator.mul, ["100", "200"], @@ -1884,6 +2202,15 @@ def decimal_series(input, dtype): cudf.Decimal64Dtype(scale=-1, precision=6), True, ), + ( + operator.mul, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + cudf.Scalar(decimal.Decimal("1.5")), + ["150", "300"], + cudf.Decimal64Dtype(scale=-1, precision=6), + True, + ), ( operator.sub, ["100", "200"], @@ -1911,6 +2238,15 @@ def decimal_series(input, dtype): cudf.Decimal64Dtype(scale=0, precision=6), False, ), + ( + operator.sub, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + cudf.Scalar(decimal.Decimal("2.5")), + ["97.5", "197.5"], + cudf.Decimal64Dtype(scale=1, precision=7), + False, + ), ( operator.sub, ["100", "200"], @@ -1938,6 +2274,15 @@ def decimal_series(input, dtype): cudf.Decimal64Dtype(scale=1, precision=7), True, ), + ( + operator.sub, + ["100", "200"], + cudf.Decimal64Dtype(scale=-2, precision=3), + cudf.Scalar(decimal.Decimal("2.5")), + ["-97.5", "-197.5"], + cudf.Decimal64Dtype(scale=1, precision=7), + True, + ), ], ) def test_binops_decimal_scalar(args): @@ -1960,6 +2305,157 @@ def decimal_series(input, dtype): utils.assert_eq(expect, got) +@pytest.mark.parametrize( + "args", + [ + ( + operator.eq, + ["100.00", "41", None], + cudf.Decimal64Dtype(scale=0, precision=5), + 100, + cudf.Series([True, False, None], dtype=bool), + cudf.Series([True, False, None], dtype=bool), + ), + ( + operator.eq, + ["100.123", "41", None], + cudf.Decimal64Dtype(scale=3, precision=6), + decimal.Decimal("100.123"), + cudf.Series([True, False, None], dtype=bool), + cudf.Series([True, False, None], dtype=bool), + ), + ( + operator.eq, + ["100.123", "41", None], + cudf.Decimal64Dtype(scale=3, precision=6), + cudf.Scalar(decimal.Decimal("100.123")), + cudf.Series([True, False, None], dtype=bool), + cudf.Series([True, False, None], dtype=bool), + ), + ( + operator.gt, + ["100.00", "41", "120.21", None], + cudf.Decimal64Dtype(scale=2, precision=5), + 100, + cudf.Series([False, False, True, None], dtype=bool), + cudf.Series([False, True, False, None], dtype=bool), + ), + ( + operator.gt, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + 
decimal.Decimal("100.123"), + cudf.Series([False, False, True, None], dtype=bool), + cudf.Series([False, True, False, None], dtype=bool), + ), + ( + operator.gt, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + cudf.Scalar(decimal.Decimal("100.123")), + cudf.Series([False, False, True, None], dtype=bool), + cudf.Series([False, True, False, None], dtype=bool), + ), + ( + operator.ge, + ["100.00", "41", "120.21", None], + cudf.Decimal64Dtype(scale=2, precision=5), + 100, + cudf.Series([True, False, True, None], dtype=bool), + cudf.Series([True, True, False, None], dtype=bool), + ), + ( + operator.ge, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + decimal.Decimal("100.123"), + cudf.Series([True, False, True, None], dtype=bool), + cudf.Series([True, True, False, None], dtype=bool), + ), + ( + operator.ge, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + cudf.Scalar(decimal.Decimal("100.123")), + cudf.Series([True, False, True, None], dtype=bool), + cudf.Series([True, True, False, None], dtype=bool), + ), + ( + operator.lt, + ["100.00", "41", "120.21", None], + cudf.Decimal64Dtype(scale=2, precision=5), + 100, + cudf.Series([False, True, False, None], dtype=bool), + cudf.Series([False, False, True, None], dtype=bool), + ), + ( + operator.lt, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + decimal.Decimal("100.123"), + cudf.Series([False, True, False, None], dtype=bool), + cudf.Series([False, False, True, None], dtype=bool), + ), + ( + operator.lt, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + cudf.Scalar(decimal.Decimal("100.123")), + cudf.Series([False, True, False, None], dtype=bool), + cudf.Series([False, False, True, None], dtype=bool), + ), + ( + operator.le, + ["100.00", "41", "120.21", None], + cudf.Decimal64Dtype(scale=2, precision=5), + 100, + cudf.Series([True, True, False, None], dtype=bool), + cudf.Series([True, False, True, None], dtype=bool), + ), + ( + operator.le, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + decimal.Decimal("100.123"), + cudf.Series([True, True, False, None], dtype=bool), + cudf.Series([True, False, True, None], dtype=bool), + ), + ( + operator.le, + ["100.123", "41", "120.21", None], + cudf.Decimal64Dtype(scale=3, precision=6), + cudf.Scalar(decimal.Decimal("100.123")), + cudf.Series([True, True, False, None], dtype=bool), + cudf.Series([True, False, True, None], dtype=bool), + ), + ], +) +@pytest.mark.parametrize("reflected", [True, False]) +def test_binops_decimal_scalar_compare(args, reflected): + """ + Tested compare operations: + eq, lt, gt, le, ge + Each operation has 3 data setups: pyints, Decimal, and + decimal cudf.Scalar + For each data setup, there is at least one row that lead to one of the + following compare results: {True, False, None}. 
+ """ + if not reflected: + op, ldata, ldtype, rdata, expected, _ = args + else: + op, ldata, ldtype, rdata, _, expected = args + + lhs = _decimal_series(ldata, ldtype) + rhs = rdata + + if reflected: + rhs, lhs = lhs, rhs + + actual = op(lhs, rhs) + + utils.assert_eq(expected, actual) + + @pytest.mark.parametrize( "dtype", [ From e8f62ea16d6918cdb7821258984f625d4431fda1 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Wed, 31 Mar 2021 19:03:12 -0700 Subject: [PATCH 16/59] User resource fix for replace_nulls (#7769) `cudf::replace_nulls` was copying the input column with the default stream and resource when there is no null. This simple PR is to make sure to pass the right stream and resource to the copy constructor. Authors: - Wonchan Lee (https://github.com/magnatelee) Approvers: - Mark Harris (https://github.com/harrism) - Mike Wilson (https://github.com/hyperbolic2346) - MithunR (https://github.com/mythrocks) - Nghia Truong (https://github.com/ttnghia) - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/7769 --- cpp/src/replace/nulls.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index 65750deaa57..4cf6899116d 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -424,7 +424,7 @@ std::unique_ptr replace_nulls(cudf::column_view const& input, CUDF_EXPECTS(replacement.size() == input.size(), "Column size mismatch"); if (input.is_empty()) { return cudf::empty_like(input); } - if (!input.has_nulls()) { return std::make_unique(input); } + if (!input.has_nulls()) { return std::make_unique(input, stream, mr); } return cudf::type_dispatcher( input.type(), replace_nulls_column_kernel_forwarder{}, input, replacement, stream, mr); From 6cab04a684510f827e32820e9da7faa9741190c6 Mon Sep 17 00:00:00 2001 From: Jake Hemstad Date: Wed, 31 Mar 2021 22:03:21 -0500 Subject: [PATCH 17/59] Fix compiler warning about non-POD types passed through ellipsis (#7781) In https://github.com/rapidsai/cudf/pull/7725/ I updated some of the fall through error cases to use old, [C-style ellipsis variadics ](https://github.com/jrhemstad/cudf/blob/f92d4626c7a4296075c5aa6240d6e3b88049abe6/cpp/include/cudf/detail/gather.cuh#L145 )since we don't care about the parameters in the error case. Turns out C++ forbids passing "non-POD" types through this kind of variadic ellipsis, which ends up generating a compiler warning: > warning: non-POD class type passed through ellipsis Authors: - Jake Hemstad (https://github.com/jrhemstad) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/7781 --- cpp/include/cudf/detail/gather.cuh | 6 +++++- cpp/include/cudf/detail/scatter.cuh | 6 +++++- cpp/src/copying/copy.cu | 6 +++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index bf488621d52..936c6d9307c 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -142,7 +142,11 @@ void gather_helper(InputItr source_itr, // Error case when no other overload or specialization is available template struct column_gatherer_impl { - std::unique_ptr operator()(...) 
{ CUDF_FAIL("Unsupported type in gather."); } + template + std::unique_ptr operator()(Args&&...) + { + CUDF_FAIL("Unsupported type in gather."); + } }; /** diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index 30764b9b89f..b51c44772b5 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -81,7 +81,11 @@ auto scatter_to_gather(MapIterator scatter_map_begin, template struct column_scatterer_impl { - std::unique_ptr operator()(...) const { CUDF_FAIL("Unsupported type for scatter."); } + template + std::unique_ptr operator()(Args&&...) const + { + CUDF_FAIL("Unsupported type for scatter."); + } }; template diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index e6adc027acc..fecf7d18d46 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -31,7 +31,11 @@ namespace { template struct copy_if_else_functor_impl { - std::unique_ptr operator()(...) { CUDF_FAIL("Unsupported type for copy_if_else."); } + template + std::unique_ptr operator()(Args&&...) + { + CUDF_FAIL("Unsupported type for copy_if_else."); + } }; template From 299f6cc11fb2d61d64bf0041c192e4d232e0f8e5 Mon Sep 17 00:00:00 2001 From: Wonchan Lee Date: Thu, 1 Apr 2021 02:52:41 -0700 Subject: [PATCH 18/59] Allow hash_partition to take a seed value (#7771) This PR is to allow hash partitioning to configure the seed of its hash function. As noted in #6307, using the same hash function in hash partitioning and join leads to a massive hash collision and severely degrades join performance on multiple GPUs. There was an initial fix (#6726) to this problem, but it added only the code path to use identity hash function in hash partitioning, which doesn't support complex data types and thus cannot be used in general. In fact, using the same general Murmur3 hash function with different seeds in hash partitioning and join turned out to be a sufficient fix. This PR is to enable such configurations by making `hash_partition` accept an optional seed value. 
Authors:
  - Wonchan Lee (https://github.com/magnatelee)

Approvers:
  - https://github.com/gaohao95
  - Mark Harris (https://github.com/harrism)
  - https://github.com/nvdbaranec
  - Jake Hemstad (https://github.com/jrhemstad)

URL: https://github.com/rapidsai/cudf/pull/7771
---
 .../cudf/detail/utilities/hash_functions.cuh | 30 +++++++++++++------
 cpp/include/cudf/hashing.hpp                 |  2 +-
 cpp/include/cudf/partitioning.hpp            |  4 +++
 cpp/include/cudf/table/row_operators.cuh     | 27 ++++++++++++-----
 cpp/include/cudf/types.hpp                   |  5 ++++
 cpp/src/partitioning/partitioning.cu         | 13 ++++----
 .../partitioning/hash_partition_test.cpp     | 28 +++++++++++++++++
 7 files changed, 87 insertions(+), 22 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh
index e79107e32cf..7f3c05134e2 100644
--- a/cpp/include/cudf/detail/utilities/hash_functions.cuh
+++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include <cudf/types.hpp>
 #include
 
 using hash_value_type = uint32_t;
@@ -231,6 +232,9 @@ MD5ListHasher::operator()(column_device_view data_col,
 }
 
 struct MD5Hash {
+  MD5Hash() = default;
+  constexpr MD5Hash(uint32_t seed) : m_seed(seed) {}
+
   void __device__ finalize(md5_intermediate_data* hash_state, char* result_location) const
   {
     auto const full_length = (static_cast<uint64_t>(hash_state->message_length)) << 3;
@@ -302,6 +306,9 @@ struct MD5Hash {
   {
     md5_process(col.element<T>(row_index), hash_state);
   }
+
+ private:
+  uint32_t m_seed{cudf::DEFAULT_HASH_SEED};
 };
 
 template <>
@@ -372,7 +379,7 @@ struct MurmurHash3_32 {
   using result_type = hash_value_type;
 
   MurmurHash3_32() = default;
-  CUDA_HOST_DEVICE_CALLABLE MurmurHash3_32(uint32_t seed) : m_seed(seed) {}
+  constexpr MurmurHash3_32(uint32_t seed) : m_seed(seed) {}
 
   CUDA_DEVICE_CALLABLE uint32_t rotl32(uint32_t x, int8_t r) const
   {
@@ -469,7 +476,7 @@ struct MurmurHash3_32 {
   }
 
  private:
-  uint32_t m_seed{0};
+  uint32_t m_seed{cudf::DEFAULT_HASH_SEED};
 };
 
 template <>
@@ -564,7 +571,7 @@ struct SparkMurmurHash3_32 {
   using result_type = hash_value_type;
 
   SparkMurmurHash3_32() = default;
-  CUDA_HOST_DEVICE_CALLABLE SparkMurmurHash3_32(uint32_t seed) : m_seed(seed) {}
+  constexpr SparkMurmurHash3_32(uint32_t seed) : m_seed(seed) {}
 
   CUDA_DEVICE_CALLABLE uint32_t rotl32(uint32_t x, int8_t r) const
   {
@@ -636,7 +643,7 @@ struct SparkMurmurHash3_32 {
   }
 
  private:
-  uint32_t m_seed{0};
+  uint32_t m_seed{cudf::DEFAULT_HASH_SEED};
 };
 
 template <>
@@ -772,6 +779,8 @@ SparkMurmurHash3_32<double>::operator()(double const& key) const
 template <typename Key>
 struct IdentityHash {
   using result_type = hash_value_type;
+  IdentityHash() = default;
+  constexpr IdentityHash(uint32_t seed) : m_seed(seed) {}
 
   /**
    * @brief Combines two hash values into a new single hash value. Called
@@ -784,7 +793,7 @@ struct IdentityHash {
    *
    * @returns A hash value that intelligently combines the lhs and rhs hash values
    */
-  CUDA_HOST_DEVICE_CALLABLE result_type hash_combine(result_type lhs, result_type rhs) const
+  constexpr result_type hash_combine(result_type lhs, result_type rhs) const
   {
     result_type combined{lhs};
 
@@ -794,19 +803,22 @@ struct IdentityHash {
   }
 
   template <typename return_type = result_type>
-  CUDA_HOST_DEVICE_CALLABLE std::enable_if_t<!std::is_arithmetic<Key>::value, return_type>
-  operator()(Key const& key) const
+  constexpr std::enable_if_t<!std::is_arithmetic<Key>::value, return_type> operator()(
+    Key const& key) const
   {
     cudf_assert(false && "IdentityHash does not support this data type");
     return 0;
   }
 
   template <typename return_type = result_type>
-  CUDA_HOST_DEVICE_CALLABLE std::enable_if_t<std::is_arithmetic<Key>::value, return_type>
-  operator()(Key const& key) const
+  constexpr std::enable_if_t<std::is_arithmetic<Key>::value, return_type> operator()(
+    Key const& key) const
   {
     return static_cast<result_type>(key);
   }
+
+ private:
+  uint32_t m_seed{cudf::DEFAULT_HASH_SEED};
 };
 
 template
diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp
index 3f95b8b417b..0fb5002a953 100644
--- a/cpp/include/cudf/hashing.hpp
+++ b/cpp/include/cudf/hashing.hpp
@@ -39,7 +39,7 @@ std::unique_ptr<column> hash(
   table_view const& input,
   hash_id hash_function = hash_id::HASH_MURMUR3,
   std::vector<uint32_t> const& initial_hash = {},
-  uint32_t seed = 0,
+  uint32_t seed = DEFAULT_HASH_SEED,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp
index ddde26ec762..6b1ad7db08b 100644
--- a/cpp/include/cudf/partitioning.hpp
+++ b/cpp/include/cudf/partitioning.hpp
@@ -83,6 +83,9 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> partition(
  * @param input The table to partition
  * @param columns_to_hash Indices of input columns to hash
  * @param num_partitions The number of partitions to use
+ * @param hash_function Optional hash id that chooses the hash function to use
+ * @param seed Optional seed value to the hash function
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory.
  *
  * @returns An output table and a vector of row offsets to each partition
@@ -92,6 +95,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
   std::vector<size_type> const& columns_to_hash,
   int num_partitions,
   hash_id hash_function = hash_id::HASH_MURMUR3,
+  uint32_t seed = DEFAULT_HASH_SEED,
   rmm::cuda_stream_view stream = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
index decd2879f54..61d714c5538 100644
--- a/cpp/include/cudf/table/row_operators.cuh
+++ b/cpp/include/cudf/table/row_operators.cuh
@@ -428,6 +428,7 @@ template