From 09995a5d3f5b1dd83584529e44ccf774d7f6efe2 Mon Sep 17 00:00:00 2001
From: ChrisJar
Date: Wed, 27 Apr 2022 13:13:29 -0500
Subject: [PATCH 01/28] Add bindings for index_of with column search key (#10696)

This adds bindings for `index_of` to enable using `list.index` with a Series of search keys.

Closes #10692

cc: @randerzander

Authors:
  - https://github.com/ChrisJar

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/10696
---
 python/cudf/cudf/_lib/cpp/lists/contains.pxd |  5 ++
 python/cudf/cudf/_lib/lists.pyx              | 20 ++++++-
 python/cudf/cudf/core/column/lists.py        | 61 ++++++++++++++++++--
 python/cudf/cudf/tests/test_list.py          | 55 ++++++++++++++++--
 4 files changed, 129 insertions(+), 12 deletions(-)

diff --git a/python/cudf/cudf/_lib/cpp/lists/contains.pxd b/python/cudf/cudf/_lib/cpp/lists/contains.pxd
index 46aea37643f..e3cb01721a0 100644
--- a/python/cudf/cudf/_lib/cpp/lists/contains.pxd
+++ b/python/cudf/cudf/_lib/cpp/lists/contains.pxd
@@ -18,3 +18,8 @@ cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil:
         lists_column_view lists,
         scalar search_key,
     ) except +
+
+    cdef unique_ptr[column] index_of(
+        lists_column_view lists,
+        column_view search_keys,
+    ) except +
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index e5a705ab603..025fb0665d3 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -176,7 +176,7 @@ def contains_scalar(Column col, object py_search_key):
     return result


-def index_of(Column col, object py_search_key):
+def index_of_scalar(Column col, object py_search_key):

     cdef DeviceScalar search_key = py_search_key.device_value

@@ -195,6 +195,24 @@ def index_of(Column col, object py_search_key):
     return Column.from_unique_ptr(move(c_result))


+def index_of_column(Column col, Column search_keys):
+
+    cdef column_view keys_view = search_keys.view()
+
+    cdef shared_ptr[lists_column_view] list_view = (
+        make_shared[lists_column_view](col.view())
+    )
+
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_index_of(
+            list_view.get()[0],
+            keys_view,
+        ))
+    return Column.from_unique_ptr(move(c_result))
+
+
 def concatenate_rows(list source_columns):

     cdef unique_ptr[column] c_result
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index df6aaa91a2b..2964378d114 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -17,7 +17,8 @@
     drop_list_duplicates,
     extract_element_column,
     extract_element_scalar,
-    index_of,
+    index_of_column,
+    index_of_scalar,
     sort_lists,
 )
 from cudf._lib.strings.convert.convert_lists import format_list_column
@@ -463,10 +464,61 @@ def contains(self, search_key: ScalarLike) -> ParentType:
                 raise
         return res

-    def index(self, search_key: ScalarLike) -> ParentType:
-        search_key = cudf.Scalar(search_key)
+    def index(self, search_key: Union[ScalarLike, ColumnLike]) -> ParentType:
+        """
+        Returns integers representing the index of the search key for each row.
+
+        If ``search_key`` is a sequence, it must be the same length as the
+        Series and ``search_key[i]`` represents the search key for the
+        ``i``-th row of the Series.
+
+        If the search key is not contained in a row, -1 is returned. If either
+        the row or the search key is null, <NA> is returned. If the search key
+        is contained multiple times, the smallest matching index is returned.
+
+        Parameters
+        ----------
+        search_key : scalar or sequence of scalars
+            Element or elements being searched for in each row of the list
+            column
+
+        Returns
+        -------
+        Series or Index
+
+        Examples
+        --------
+        >>> s = cudf.Series([[1, 2, 3], [3, 4, 5], [4, 5, 6]])
+        >>> s.list.index(4)
+        0   -1
+        1    1
+        2    0
+        dtype: int32
+
+        >>> s = cudf.Series([["a", "b", "c"], ["x", "y", "z"]])
+        >>> s.list.index(["b", "z"])
+        0    1
+        1    2
+        dtype: int32
+
+        >>> s = cudf.Series([[4, 5, 6], None, [-3, -2, -1]])
+        >>> s.list.index([None, 3, -2])
+        0    <NA>
+        1    <NA>
+        2       1
+        dtype: int32
+        """
+
         try:
-            res = self._return_or_inplace(index_of(self._column, search_key))
+            if is_scalar(search_key):
+                return self._return_or_inplace(
+                    index_of_scalar(self._column, cudf.Scalar(search_key))
+                )
+            else:
+                return self._return_or_inplace(
+                    index_of_column(self._column, as_column(search_key))
+                )
+
         except RuntimeError as e:
             if (
                 "Type/Scale of search key does not "
@@ -474,7 +526,6 @@ def index(self, search_key: ScalarLike) -> ParentType:
             ):
                 raise TypeError(str(e)) from e
             raise
-        return res

     @property
     def leaves(self) -> ParentType:
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index c21e1a0f61f..09eee3520e5 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -11,6 +11,7 @@
 import cudf
 from cudf import NA
 from cudf._lib.copying import get_element
+from cudf.api.types import is_scalar
 from cudf.testing._utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
@@ -425,7 +426,7 @@ def test_contains_invalid(data, scalar):


 @pytest.mark.parametrize(
-    "data, scalar, expect",
+    "data, search_key, expect",
     [
         (
             [[1, 2, 3], [], [3, 4, 5]],
@@ -448,6 +449,16 @@ def test_contains_invalid(data, scalar):
             "y",
             [3, -1],
         ),
+        (
+            [["h", "a", None], ["t", "g"]],
+            ["a", "b"],
+            [1, -1],
+        ),
+        (
+            [None, ["h", "i"], ["p", "k", "z"]],
+            ["x", None, "z"],
+            [None, None, 2],
+        ),
         (
             [["d", None, "e"], [None, "f"], []],
             cudf.Scalar(cudf.NA, "O"),
@@ -460,15 +471,21 @@ def test_contains_invalid(data, scalar):
         ),
     ],
 )
-def test_index(data, scalar, expect):
+def test_index(data, search_key, expect):
     sr = cudf.Series(data)
     expect = cudf.Series(expect, dtype="int32")
-    got = sr.list.index(cudf.Scalar(scalar, sr.dtype.element_type))
+    if is_scalar(search_key):
+        got = sr.list.index(cudf.Scalar(search_key, sr.dtype.element_type))
+    else:
+        got = sr.list.index(
+            cudf.Series(search_key, dtype=sr.dtype.element_type)
+        )
+
     assert_eq(expect, got)


 @pytest.mark.parametrize(
-    "data, scalar",
+    "data, search_key",
     [
         (
             [[9, None, 8], [], [7, 6, 5]],
             "c",
         ),
         (
             [["a", "b", "c"], None, [None, "d"]],
             2,
         ),
+        (
+            [["e", "s"], ["t", "w"]],
+            [5, 6],
+        ),
     ],
 )
-def test_index_invalid(data, scalar):
+def test_index_invalid_type(data, search_key):
     sr = cudf.Series(data)
     with pytest.raises(
         TypeError,
         match="Type/Scale of search key does not "
         "match list column element type.",
     ):
-        sr.list.index(scalar)
+        sr.list.index(search_key)
+
+
+@pytest.mark.parametrize(
+    "data, search_key",
+    [
+        (
+            [[5, 8], [2, 6]],
+            [8, 2, 4],
+        ),
+        (
+            [["h", "j"], ["p", None], ["t", "z"]],
+            ["j", "a"],
+        ),
+    ],
+)
+def test_index_invalid_length(data, search_key):
+    sr = cudf.Series(data)
+    with pytest.raises(
+        RuntimeError,
+        match="Number of search keys must match list column size.",
+    ):
+        sr.list.index(search_key)


 @pytest.mark.parametrize(

From 1f8a03e69704562dfac38de40b7172650280c6ea Mon Sep 17 00:00:00 2001
From: Karthikeyan
<6488848+karthikeyann@users.noreply.github.com> Date: Wed, 27 Apr 2022 23:57:20 +0530 Subject: [PATCH 02/28] Replace std::make_pair with std::pair (C++17 CTAD) (#10727) Addresses part of https://github.com/rapidsai/cudf/issues/10350 Take advantage of C++17 feature CTAD. Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10727 --- cpp/benchmarks/reduction/segment_reduce.cu | 2 +- cpp/docs/DEVELOPER_GUIDE.md | 4 ++-- cpp/include/cudf/detail/null_mask.cuh | 2 +- cpp/include/cudf/detail/valid_if.cuh | 2 +- cpp/include/cudf/strings/detail/utilities.cuh | 2 +- .../cudf/table/experimental/row_operators.cuh | 13 ++++++------- cpp/include/cudf_test/column_wrapper.hpp | 16 ++++++++-------- cpp/src/bitmask/null_mask.cu | 8 ++++---- cpp/src/copying/contiguous_split.cu | 4 ++-- cpp/src/groupby/groupby.cu | 17 ++++++++--------- cpp/src/groupby/hash/groupby.cu | 2 +- cpp/src/groupby/sort/aggregate.cpp | 2 +- cpp/src/groupby/sort/group_collect.cu | 7 +++---- cpp/src/groupby/sort/scan.cpp | 2 +- cpp/src/io/orc/aggregate_orc_metadata.cpp | 6 +++--- cpp/src/io/orc/reader_impl.cu | 18 +++++++++--------- cpp/src/io/parquet/chunk_dict.cu | 2 +- cpp/src/io/parquet/writer_impl.cu | 8 ++++---- cpp/src/join/conditional_join.cu | 16 ++++++++-------- cpp/src/join/hash_join.cu | 12 ++++++------ cpp/src/join/join.cu | 2 +- cpp/src/join/join_utils.cu | 4 ++-- cpp/src/join/mixed_join.cu | 16 ++++++++-------- .../lists/combine/concatenate_list_elements.cu | 2 +- cpp/src/lists/copying/scatter_helper.cu | 6 +++--- cpp/src/partitioning/partitioning.cu | 11 +++++------ cpp/src/partitioning/round_robin.cu | 10 +++++----- cpp/src/replace/clamp.cu | 2 +- cpp/src/strings/convert/convert_datetime.cu | 2 +- cpp/src/strings/json/json_path.cu | 4 ++-- cpp/src/strings/repeat_strings.cu | 6 +++--- cpp/src/structs/utilities.cpp | 6 +++--- cpp/src/text/subword/data_normalizer.cu | 8 ++++---- cpp/src/transform/bools_to_mask.cu | 8 ++++---- cpp/src/transform/encode.cu | 2 +- cpp/src/transform/nans_to_nulls.cu | 8 +++----- cpp/src/transform/one_hot_encode.cu | 10 ++++------ cpp/src/transpose/transpose.cu | 4 ++-- cpp/tests/groupby/m2_tests.cpp | 5 ++--- cpp/tests/groupby/merge_lists_tests.cpp | 5 ++--- cpp/tests/groupby/merge_m2_tests.cpp | 12 +++++------- cpp/tests/groupby/merge_sets_tests.cpp | 5 ++--- cpp/tests/interop/to_arrow_test.cpp | 2 +- cpp/tests/join/conditional_join_tests.cu | 6 +++--- cpp/tests/join/join_tests.cpp | 2 +- cpp/tests/join/mixed_join_tests.cu | 6 +++--- cpp/tests/merge/merge_test.cpp | 4 ++-- cpp/tests/search/search_struct_test.cpp | 2 +- .../stream_compaction/distinct_count_tests.cpp | 2 +- .../stream_compaction/unique_count_tests.cpp | 2 +- cpp/tests/strings/translate_tests.cpp | 2 +- 51 files changed, 149 insertions(+), 162 deletions(-) diff --git a/cpp/benchmarks/reduction/segment_reduce.cu b/cpp/benchmarks/reduction/segment_reduce.cu index 3723147d95c..08fc4622b43 100644 --- a/cpp/benchmarks/reduction/segment_reduce.cu +++ b/cpp/benchmarks/reduction/segment_reduce.cu @@ -82,7 +82,7 @@ std::pair, thrust::device_vector> make_test_d thrust::device_vector d_offsets(offset_it, offset_it + num_segments + 1); - return std::make_pair(std::move((input->release())[0]), d_offsets); + return std::pair(std::move((input->release())[0]), d_offsets); } template diff --git a/cpp/docs/DEVELOPER_GUIDE.md b/cpp/docs/DEVELOPER_GUIDE.md index 
165edd443f6..84f69f559a8 100644 --- a/cpp/docs/DEVELOPER_GUIDE.md +++ b/cpp/docs/DEVELOPER_GUIDE.md @@ -572,7 +572,7 @@ The preferred style for how inputs are passed in and outputs are returned is the Sometimes it is necessary for functions to have multiple outputs. There are a few ways this can be done in C++ (including creating a `struct` for the output). One convenient way to do this is -using `std::tie` and `std::make_pair`. Note that objects passed to `std::make_pair` will invoke +using `std::tie` and `std::pair`. Note that objects passed to `std::pair` will invoke either the copy constructor or the move constructor of the object, and it may be preferable to move non-trivially copyable objects (and required for types with deleted copy constructors, like `std::unique_ptr`). @@ -585,7 +585,7 @@ std::pair return_two_tables(void){ // Do stuff with out0, out1 // Return a std::pair of the two outputs - return std::make_pair(std::move(out0), std::move(out1)); + return std::pair(std::move(out0), std::move(out1)); } cudf::table out0; diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index 7aec56fdc51..6a6cdd43004 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -133,7 +133,7 @@ std::pair bitmask_binop( stream, mr); - return std::make_pair(std::move(dest_mask), null_count); + return std::pair(std::move(dest_mask), null_count); } /** diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh index aa4421bb4ed..f91f51b2161 100644 --- a/cpp/include/cudf/detail/valid_if.cuh +++ b/cpp/include/cudf/detail/valid_if.cuh @@ -110,7 +110,7 @@ std::pair valid_if( null_count = size - valid_count.value(stream); } - return std::make_pair(std::move(null_mask), null_count); + return std::pair(std::move(null_mask), null_count); } /** diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index bb7f29a4172..e6dba5147b5 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -156,7 +156,7 @@ auto make_strings_children( for_each_fn(size_and_exec_fn); } - return std::make_pair(std::move(offsets_column), std::move(chars_column)); + return std::pair(std::move(offsets_column), std::move(chars_column)); } /** diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 88e31744fdf..32b71e660ac 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -165,14 +165,13 @@ class device_row_comparator { bool const rhs_is_null{_rhs.is_null(rhs_element_index)}; if (lhs_is_null or rhs_is_null) { // at least one is null - return cuda::std::make_pair(null_compare(lhs_is_null, rhs_is_null, _null_precedence), - _depth); + return cuda::std::pair(null_compare(lhs_is_null, rhs_is_null, _null_precedence), _depth); } } - return cuda::std::make_pair(relational_compare(_lhs.element(lhs_element_index), - _rhs.element(rhs_element_index)), - std::numeric_limits::max()); + return cuda::std::pair(relational_compare(_lhs.element(lhs_element_index), + _rhs.element(rhs_element_index)), + std::numeric_limits::max()); } template {{}, -1}, - [](auto acc, lists_column_wrapper const& lcw) { - return lcw.depth > acc.second ? 
std::make_pair(lcw.get_view(), lcw.depth) : acc; - }); + auto const hierarchy_and_depth = + std::accumulate(elements.begin(), + elements.end(), + std::pair{{}, -1}, + [](auto acc, lists_column_wrapper const& lcw) { + return lcw.depth > acc.second ? std::pair(lcw.get_view(), lcw.depth) : acc; + }); column_view expected_hierarchy = hierarchy_and_depth.first; int32_t const expected_depth = hierarchy_and_depth.second; diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 756cf3421c9..ec14f8e6ded 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -445,7 +445,7 @@ std::pair bitmask_and(table_view const& view, CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; if (view.num_rows() == 0 or view.num_columns() == 0) { - return std::make_pair(std::move(null_mask), 0); + return std::pair(std::move(null_mask), 0); } std::vector masks; @@ -467,7 +467,7 @@ std::pair bitmask_and(table_view const& view, mr); } - return std::make_pair(std::move(null_mask), 0); + return std::pair(std::move(null_mask), 0); } // Returns the bitwise OR of the null masks of all columns in the table view @@ -478,7 +478,7 @@ std::pair bitmask_or(table_view const& view, CUDF_FUNC_RANGE(); rmm::device_buffer null_mask{0, stream, mr}; if (view.num_rows() == 0 or view.num_columns() == 0) { - return std::make_pair(std::move(null_mask), 0); + return std::pair(std::move(null_mask), 0); } std::vector masks; @@ -500,7 +500,7 @@ std::pair bitmask_or(table_view const& view, mr); } - return std::make_pair(std::move(null_mask), 0); + return std::pair(std::move(null_mask), 0); } } // namespace detail diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 514374d450d..35e7eba974f 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -688,9 +688,9 @@ BufInfo build_output_columns(InputIter begin, ? 
0 : (current_info->num_rows - current_info->valid_count); ++current_info; - return std::make_pair(ptr, null_count); + return std::pair(ptr, null_count); } - return std::make_pair(static_cast(nullptr), 0); + return std::pair(static_cast(nullptr), 0); }(); // size/data pointer for the column diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 57bb222aaa0..79882239b38 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -83,8 +83,7 @@ std::pair, std::vector> groupby::disp "Unsupported groupby key type does not support equality comparison"); auto [grouped_keys, results] = detail::hash::groupby(flattened_keys, requests, _include_null_keys, stream, mr); - return std::make_pair(unflatten_nested_columns(std::move(grouped_keys), _keys), - std::move(results)); + return std::pair(unflatten_nested_columns(std::move(grouped_keys), _keys), std::move(results)); } else { return sort_aggregate(requests, stream, mr); } @@ -193,7 +192,7 @@ std::pair, std::vector> groupby::aggr verify_valid_requests(requests); - if (_keys.num_rows() == 0) { return std::make_pair(empty_like(_keys), empty_results(requests)); } + if (_keys.num_rows() == 0) { return std::pair(empty_like(_keys), empty_results(requests)); } return dispatch_aggregation(requests, rmm::cuda_stream_default, mr); } @@ -211,7 +210,7 @@ std::pair, std::vector> groupby::scan verify_valid_requests(requests); - if (_keys.num_rows() == 0) { return std::make_pair(empty_like(_keys), empty_results(requests)); } + if (_keys.num_rows() == 0) { return std::pair(empty_like(_keys), empty_results(requests)); } return sort_scan(requests, rmm::cuda_stream_default, mr); } @@ -250,7 +249,7 @@ std::pair, std::unique_ptr> groupby::replace_nulls CUDF_EXPECTS(static_cast(replace_policies.size()) == values.num_columns(), "Size mismatch between num_columns and replace_policies."); - if (values.is_empty()) { return std::make_pair(empty_like(_keys), empty_like(values)); } + if (values.is_empty()) { return std::pair(empty_like(_keys), empty_like(values)); } auto const stream = rmm::cuda_stream_default; auto const& group_labels = helper().group_labels(stream); @@ -269,8 +268,8 @@ std::pair, std::unique_ptr
> groupby::replace_nulls
                     : std::move(grouped_values);
   });

-  return std::make_pair(std::move(helper().sorted_keys(stream, mr)),
-                        std::make_unique<table>(std::move(results)));
+  return std::pair(std::move(helper().sorted_keys(stream, mr)),
+                   std::make_unique<table>(std::move(results)));
 }

 // Get the sort helper object
@@ -310,8 +309,8 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::shift(
     grouped_values->view(), group_offsets, offsets[i], fill_values[i].get(), stream, mr);
   });

-  return std::make_pair(helper().sorted_keys(stream, mr),
-                        std::make_unique<table>(std::move(results)));
+  return std::pair(helper().sorted_keys(stream, mr),
+                   std::make_unique<table>(std::move(results)));
 }

 }  // namespace groupby
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index f225afaec71..e22b3a4f3a4 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -672,7 +672,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby(
   std::unique_ptr<table>
unique_keys = groupby(keys, requests, &cache, has_nulls(keys), include_null_keys, stream, mr); - return std::make_pair(std::move(unique_keys), extract_results(requests, cache, stream, mr)); + return std::pair(std::move(unique_keys), extract_results(requests, cache, stream, mr)); } } // namespace hash } // namespace detail diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 4904aa42723..02036ff0bbf 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -778,7 +778,7 @@ std::pair, std::vector> groupby::sort auto results = detail::extract_results(requests, cache, stream, mr); - return std::make_pair(helper().unique_keys(stream, mr), std::move(results)); + return std::pair(helper().unique_keys(stream, mr), std::move(results)); } } // namespace groupby } // namespace cudf diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu index 8b8a03f35a5..000a595ea2f 100644 --- a/cpp/src/groupby/sort/group_collect.cu +++ b/cpp/src/groupby/sort/group_collect.cu @@ -82,8 +82,7 @@ std::pair, std::unique_ptr> purge_null_entries( auto null_purged_offsets = strings::detail::make_offsets_child_column( null_purged_sizes.cbegin(), null_purged_sizes.cend(), stream, mr); - return std::make_pair, std::unique_ptr>( - std::move(null_purged_values), std::move(null_purged_offsets)); + return std::pair(std::move(null_purged_values), std::move(null_purged_offsets)); } std::unique_ptr group_collect(column_view const& values, @@ -109,8 +108,8 @@ std::unique_ptr group_collect(column_view const& values, return cudf::groupby::detail::purge_null_entries( values, offsets_column->view(), num_groups, stream, mr); } else { - return std::make_pair(std::make_unique(values, stream, mr), - std::move(offsets_column)); + return std::pair(std::make_unique(values, stream, mr), + std::move(offsets_column)); } }(); diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 8c4959da35b..20edc1b3f50 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -185,7 +185,7 @@ std::pair, std::vector> groupby::sort auto results = detail::extract_results(requests, cache, stream, mr); - return std::make_pair(helper().sorted_keys(stream, mr), std::move(results)); + return std::pair(helper().sorted_keys(stream, mr), std::move(results)); } } // namespace groupby } // namespace cudf diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index a4ae9999a19..47244279599 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -177,7 +177,7 @@ std::vector aggregate_orc_metadata::select_stri per_file_metadata[src_file_idx].ff.stripes.size()), "Invalid stripe index"); stripe_infos.push_back( - std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); + std::pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); row_count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; } selected_stripes_mapping.push_back({static_cast(src_file_idx), stripe_infos}); @@ -206,7 +206,7 @@ std::vector aggregate_orc_metadata::select_stri count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows; if (count > row_start || count == 0) { stripe_infos.push_back( - std::make_pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); + std::pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr)); } else { stripe_skip_rows = count; } diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 83c23774362..a768d568178 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -108,20 +108,20 @@ constexpr std::pair get_index_type_and_pos( case orc::DATA: skip_count += 1; skip_count |= (skip_count & 0xff) << 8; - return std::make_pair(gpu::CI_DATA, skip_count); + return std::pair(gpu::CI_DATA, skip_count); case orc::LENGTH: case orc::SECONDARY: skip_count += 1; skip_count |= (skip_count & 0xff) << 16; - return std::make_pair(gpu::CI_DATA2, skip_count); - case orc::DICTIONARY_DATA: return std::make_pair(gpu::CI_DICTIONARY, skip_count); + return std::pair(gpu::CI_DATA2, skip_count); + case orc::DICTIONARY_DATA: return std::pair(gpu::CI_DICTIONARY, skip_count); case orc::PRESENT: skip_count += (non_child ? 1 : 0); - return std::make_pair(gpu::CI_PRESENT, skip_count); - case orc::ROW_INDEX: return std::make_pair(gpu::CI_INDEX, skip_count); + return std::pair(gpu::CI_PRESENT, skip_count); + case orc::ROW_INDEX: return std::pair(gpu::CI_INDEX, skip_count); default: // Skip this stream as it's not strictly required - return std::make_pair(gpu::CI_NUM_STREAMS, 0); + return std::pair(gpu::CI_NUM_STREAMS, 0); } } @@ -1120,9 +1120,9 @@ table_with_metadata reader::impl::read(size_type skip_rows, if (_metadata.per_file_metadata[stripe_source_mapping.source_idx] .source->is_device_read_preferred(len)) { read_tasks.push_back( - std::make_pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] - .source->device_read_async(offset, len, d_dst, stream), - len)); + std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx] + .source->device_read_async(offset, len, d_dst, stream), + len)); } else { const auto buffer = diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 9075a319ab3..93e76a6ac23 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -75,7 +75,7 @@ struct map_insert_fn { if constexpr (column_device_view::has_element_accessor()) { auto hash_fn = hash_functor{col}; auto equality_fn = equality_functor{col}; - return map.insert(std::make_pair(i, i), hash_fn, equality_fn); + return map.insert(std::pair(i, i), hash_fn, equality_fn); } else { CUDF_UNREACHABLE("Unsupported type to insert in map"); } diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 92d436e4566..75a50714407 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -876,7 +876,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, std::vector> dict_data; std::vector> 
dict_index; - if (h_chunks.size() == 0) { return std::make_pair(std::move(dict_data), std::move(dict_index)); } + if (h_chunks.size() == 0) { return std::pair(std::move(dict_data), std::move(dict_index)); } // Allocate slots for each chunk std::vector> hash_maps_storage; @@ -912,7 +912,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, // We don't use dictionary if the indices are > 16 bits because that's the maximum bitpacking // bitsize we efficiently support - if (nbits > 16) { return std::make_pair(false, 0); } + if (nbits > 16) { return std::pair(false, 0); } // Only these bit sizes are allowed for RLE encoding because it's compute optimized constexpr auto allowed_bitsizes = std::array{1, 2, 4, 8, 12, 16}; @@ -925,7 +925,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, bool use_dict = (ck.plain_data_size > dict_enc_size); if (not use_dict) { rle_bits = 0; } - return std::make_pair(use_dict, rle_bits); + return std::pair(use_dict, rle_bits); }(); } @@ -946,7 +946,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, gpu::collect_map_entries(chunks.device_view().flat_view(), stream); gpu::get_dictionary_indices(frags, stream); - return std::make_pair(std::move(dict_data), std::move(dict_index)); + return std::pair(std::move(dict_data), std::move(dict_index)); } void writer::impl::init_encoder_pages(hostdevice_2dvector& chunks, diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 9bf7e6a7a43..ae1561b422b 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -59,8 +59,8 @@ conditional_join(table_view const& left, // Inner and left semi joins return empty output because no matches can exist. case join_kind::INNER_JOIN: case join_kind::LEFT_SEMI_JOIN: - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); default: CUDF_FAIL("Invalid join kind."); break; } } else if (left_num_rows == 0) { @@ -70,12 +70,12 @@ conditional_join(table_view const& left, case join_kind::LEFT_ANTI_JOIN: case join_kind::INNER_JOIN: case join_kind::LEFT_SEMI_JOIN: - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); // Full joins need to return the trivial complement. case join_kind::FULL_JOIN: { auto ret_flipped = get_trivial_left_join_indices(right, stream); - return std::make_pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); + return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); } default: CUDF_FAIL("Invalid join kind."); break; } @@ -139,8 +139,8 @@ conditional_join(table_view const& left, // all other cases (inner, left semi, and left anti joins) if we reach this // point we can safely return an empty result. 
if (join_size == 0) { - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } rmm::device_scalar write_index(0, stream); @@ -176,7 +176,7 @@ conditional_join(table_view const& left, swap_tables); } - auto join_indices = std::make_pair(std::move(left_indices), std::move(right_indices)); + auto join_indices = std::pair(std::move(left_indices), std::move(right_indices)); // For full joins, get the indices in the right table that were not joined to // by any row in the left table. diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 086e1e49986..8d2888fd761 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -44,7 +44,7 @@ std::pair, std::unique_ptr
> get_empty_joined_table
 {
   std::unique_ptr<table> empty_probe = empty_like(probe);
   std::unique_ptr<table>
empty_build = empty_like(build); - return std::make_pair(std::move(empty_probe), std::move(empty_build)); + return std::pair(std::move(empty_probe), std::move(empty_build)); } /** @@ -88,8 +88,8 @@ probe_join_hash_table(cudf::table_device_view build_table, // If output size is zero, return immediately if (join_size == 0) { - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } auto left_indices = std::make_unique>(join_size, stream, mr); @@ -125,7 +125,7 @@ probe_join_hash_table(cudf::table_device_view build_table, hash_table.pair_retrieve( iter, iter + probe_table_num_rows, out1_zip_begin, out2_zip_begin, equality, stream.value()); } - return std::make_pair(std::move(left_indices), std::move(right_indices)); + return std::pair(std::move(left_indices), std::move(right_indices)); } /** @@ -390,8 +390,8 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, "Mismatch in number of columns to be joined on"); if (is_trivial_join(flattened_probe_table, _build, JoinKind)) { - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } CUDF_EXPECTS(std::equal(std::cbegin(_build), diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 7a478ca2eb3..15aed83b641 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -52,7 +52,7 @@ inner_join(table_view const& left_input, if (right.num_rows() > left.num_rows()) { cudf::hash_join hj_obj(left, compare_nulls, stream); auto [right_result, left_result] = hj_obj.inner_join(right, std::nullopt, stream, mr); - return std::make_pair(std::move(left_result), std::move(right_result)); + return std::pair(std::move(left_result), std::move(right_result)); } else { cudf::hash_join hj_obj(right, compare_nulls, stream); return hj_obj.inner_join(left, std::nullopt, stream, mr); diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu index 151db830962..1eb2d4cf4a7 100644 --- a/cpp/src/join/join_utils.cu +++ b/cpp/src/join/join_utils.cu @@ -61,7 +61,7 @@ get_trivial_left_join_indices(table_view const& left, std::make_unique>(left.num_rows(), stream, mr); thrust::uninitialized_fill( rmm::exec_policy(stream), right_indices->begin(), right_indices->end(), JoinNoneValue); - return std::make_pair(std::move(left_indices), std::move(right_indices)); + return std::pair(std::move(left_indices), std::move(right_indices)); } VectorPair concatenate_vector_pairs(VectorPair& a, VectorPair& b, rmm::cuda_stream_view stream) @@ -151,7 +151,7 @@ get_left_join_indices_complement(std::unique_ptr> left_invalid_indices->end(), JoinNoneValue); - return std::make_pair(std::move(left_invalid_indices), std::move(right_indices_complement)); + return std::pair(std::move(left_invalid_indices), std::move(right_indices_complement)); } } // namespace detail diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index f9cbb2b5441..b540c013f47 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -81,8 +81,8 @@ mixed_join( case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left_conditional, stream); // Inner joins return empty output because no matches can exist. 
case join_kind::INNER_JOIN: - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); default: CUDF_FAIL("Invalid join kind."); break; } } else if (left_num_rows == 0) { @@ -90,12 +90,12 @@ mixed_join( // Left and inner joins all return empty sets. case join_kind::LEFT_JOIN: case join_kind::INNER_JOIN: - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); // Full joins need to return the trivial complement. case join_kind::FULL_JOIN: { auto ret_flipped = get_trivial_left_join_indices(right_conditional, stream); - return std::make_pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); + return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first)); } default: CUDF_FAIL("Invalid join kind."); break; } @@ -208,8 +208,8 @@ mixed_join( // all other cases (inner, left semi, and left anti joins) if we reach this // point we can safely return an empty result. if (join_size == 0) { - return std::make_pair(std::make_unique>(0, stream, mr), - std::make_unique>(0, stream, mr)); + return std::pair(std::make_unique>(0, stream, mr), + std::make_unique>(0, stream, mr)); } // Given the number of matches per row, we need to compute the offsets for insertion. @@ -258,7 +258,7 @@ mixed_join( swap_tables); } - auto join_indices = std::make_pair(std::move(left_indices), std::move(right_indices)); + auto join_indices = std::pair(std::move(left_indices), std::move(right_indices)); // For full joins, get the indices in the right table that were not joined to // by any row in the left table. diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu index fecdec0b1b2..f4d8e7678b1 100644 --- a/cpp/src/lists/combine/concatenate_list_elements.cu +++ b/cpp/src/lists/combine/concatenate_list_elements.cu @@ -81,7 +81,7 @@ std::unique_ptr concatenate_lists_ignore_null(column_view const& input, auto [null_mask, null_count] = [&] { if (!build_null_mask) - return std::make_pair(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); + return std::pair(cudf::detail::copy_bitmask(input, stream, mr), input.null_count()); // The output row will be null only if all lists on the input row are null. auto const lists_dv_ptr = column_device_view::create(lists_column_view(input).child(), stream); diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu index fecf6e1c1a1..7220e8b5980 100644 --- a/cpp/src/lists/copying/scatter_helper.cu +++ b/cpp/src/lists/copying/scatter_helper.cu @@ -175,7 +175,7 @@ struct list_child_constructor { source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable() ? construct_child_nullmask( list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr) - : std::make_pair(rmm::device_buffer{}, 0); + : std::pair(rmm::device_buffer{}, 0); auto child_column = cudf::make_fixed_width_column(source_lists_column_view.child().type(), num_child_rows, @@ -348,7 +348,7 @@ struct list_child_constructor { source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable() ? 
construct_child_nullmask( list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr) - : std::make_pair(rmm::device_buffer{}, 0); + : std::pair(rmm::device_buffer{}, 0); return cudf::make_lists_column(num_child_rows, std::move(child_offsets), @@ -444,7 +444,7 @@ struct list_child_constructor { source_lists_column_view.child().nullable() || target_lists_column_view.child().nullable() ? construct_child_nullmask( list_vector, list_offsets, source_lists, target_lists, num_child_rows, stream, mr) - : std::make_pair(rmm::device_buffer{}, 0); + : std::pair(rmm::device_buffer{}, 0); return cudf::make_structs_column(num_child_rows, std::move(child_columns), diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu index 09f07a1ca8c..0371065a2e5 100644 --- a/cpp/src/partitioning/partitioning.cu +++ b/cpp/src/partitioning/partitioning.cu @@ -595,8 +595,7 @@ std::pair, std::vector> hash_partition_table( } stream.synchronize(); // Async D2H copy must finish before returning host vec - return std::make_pair(std::make_unique
(std::move(output_cols)),
-                          std::move(partition_offsets));
+    return std::pair(std::make_unique<table>
(std::move(output_cols)), std::move(partition_offsets)); } else { // Compute a scatter map from input to output such that the output rows are // sorted by partition number @@ -613,7 +612,7 @@ std::pair, std::vector> hash_partition_table( input, row_partition_numbers.begin(), row_partition_numbers.end(), input, false, stream, mr); stream.synchronize(); // Async D2H copy must finish before returning host vec - return std::make_pair(std::move(output), std::move(partition_offsets)); + return std::pair(std::move(output), std::move(partition_offsets)); } } @@ -700,7 +699,7 @@ struct dispatch_map_type { auto scattered = cudf::detail::scatter(t, scatter_map.begin(), scatter_map.end(), t, false, stream, mr); - return std::make_pair(std::move(scattered), std::move(partition_offsets)); + return std::pair(std::move(scattered), std::move(partition_offsets)); } template @@ -728,7 +727,7 @@ std::pair, std::vector> hash_partition( // Return empty result if there are no partitions or nothing to hash if (num_partitions <= 0 || input.num_rows() == 0 || table_to_hash.num_columns() == 0) { - return std::make_pair(empty_like(input), std::vector{}); + return std::pair(empty_like(input), std::vector{}); } if (has_nulls(table_to_hash)) { @@ -753,7 +752,7 @@ std::pair, std::vector> partition( CUDF_EXPECTS(not partition_map.has_nulls(), "Unexpected null values in partition_map."); if (num_partitions == 0 or t.num_rows() == 0) { - return std::make_pair(empty_like(t), std::vector{}); + return std::pair(empty_like(t), std::vector{}); } return cudf::type_dispatcher( diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu index 193bb5a4353..9cfad602db0 100644 --- a/cpp/src/partitioning/round_robin.cu +++ b/cpp/src/partitioning/round_robin.cu @@ -104,8 +104,8 @@ std::pair, std::vector> degenerate stream, mr); - return std::make_pair(std::move(uniq_tbl), - cudf::detail::make_std_vector_sync(partition_offsets, stream)); + return std::pair(std::move(uniq_tbl), + cudf::detail::make_std_vector_sync(partition_offsets, stream)); } else { //( num_partitions > nrows ) rmm::device_uvector d_row_indices(nrows, stream); @@ -140,8 +140,8 @@ std::pair, std::vector> degenerate nedges_iter_begin + num_partitions, partition_offsets.begin()); - return std::make_pair(std::move(uniq_tbl), - cudf::detail::make_std_vector_sync(partition_offsets, stream)); + return std::pair(std::move(uniq_tbl), + cudf::detail::make_std_vector_sync(partition_offsets, stream)); } } } // namespace @@ -230,7 +230,7 @@ std::pair, std::vector> round_robin_part auto uniq_tbl = cudf::detail::gather( input, iter_begin, iter_begin + nrows, cudf::out_of_bounds_policy::DONT_CHECK, stream, mr); - auto ret_pair = std::make_pair(std::move(uniq_tbl), std::vector(num_partitions)); + auto ret_pair = std::pair(std::move(uniq_tbl), std::vector(num_partitions)); // this has the effect of rotating the set of partition sizes // right by start_partition positions: diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 8b696854c25..73b224b0c99 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -76,7 +76,7 @@ std::pair, std::unique_ptr> form_offsets_and_cha cudf::detail::get_value(offsets_column->view(), strings_count, stream); auto chars_column = cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - return std::make_pair(std::move(offsets_column), std::move(chars_column)); + return std::pair(std::move(offsets_column), std::move(chars_column)); } template diff --git 
a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 70a6252e9b3..9473bed963e 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -1086,7 +1086,7 @@ struct dispatch_from_timestamps_fn { thrust::make_counting_iterator(0), d_timestamps.size(), pfn); - return std::make_pair(std::move(offsets_column), std::move(chars_column)); + return std::pair(std::move(offsets_column), std::move(chars_column)); } template diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index 30e8770c3c2..995b6223ddc 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -670,8 +670,8 @@ std::pair>, int> build_comma auto const is_empty = h_operators.size() == 1 && h_operators[0].type == path_operator_type::END; return is_empty - ? std::make_pair(thrust::nullopt, 0) - : std::make_pair( + ? std::pair(thrust::nullopt, 0) + : std::pair( thrust::make_optional(cudf::detail::make_device_uvector_sync(h_operators, stream)), max_stack_depth); } diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index d496b46bc36..7a3e0fb0243 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -283,7 +283,7 @@ auto make_strings_children(Func fn, for_each_fn(fn); } - return std::make_pair(std::move(offsets_column), std::move(chars_column)); + return std::pair(std::move(offsets_column), std::move(chars_column)); } } // namespace @@ -345,7 +345,7 @@ std::pair, int64_t> repeat_strings_output_sizes( auto const strings_count = input.size(); if (strings_count == 0) { - return std::make_pair(make_empty_column(type_to_id()), int64_t{0}); + return std::pair(make_empty_column(type_to_id()), int64_t{0}); } auto output_sizes = make_numeric_column( @@ -374,7 +374,7 @@ std::pair, int64_t> repeat_strings_output_sizes( int64_t{0}, thrust::plus{}); - return std::make_pair(std::move(output_sizes), total_bytes); + return std::pair(std::move(output_sizes), total_bytes); } } // namespace detail diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp index 852a32bed3d..a2c173cae5f 100644 --- a/cpp/src/structs/utilities.cpp +++ b/cpp/src/structs/utilities.cpp @@ -371,7 +371,7 @@ std::tuple> superimpose_paren auto [new_child_mask, null_count] = [&] { if (not child.nullable()) { // Adopt parent STRUCT's null mask. - return std::make_pair(structs_column.null_mask(), 0); + return std::pair(structs_column.null_mask(), 0); } // Both STRUCT and child are nullable. AND() for the child's new null mask. 
@@ -387,8 +387,8 @@ std::tuple> superimpose_paren stream, mr); ret_validity_buffers.push_back(std::move(new_mask)); - return std::make_pair( - reinterpret_cast(ret_validity_buffers.back().data()), null_count); + return std::pair(reinterpret_cast(ret_validity_buffers.back().data()), + null_count); }(); return cudf::column_view( diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 2ed59c3ae0c..71f9e3f7043 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -278,8 +278,8 @@ uvector_pair data_normalizer::normalize(char const* d_strings, rmm::cuda_stream_view stream) const { if (num_strings == 0) - return std::make_pair(std::make_unique>(0, stream), - std::make_unique>(0, stream)); + return std::pair(std::make_unique>(0, stream), + std::make_unique>(0, stream)); // copy offsets to working memory size_t const num_offsets = num_strings + 1; @@ -294,8 +294,8 @@ uvector_pair data_normalizer::normalize(char const* d_strings, }); uint32_t const bytes_count = d_strings_offsets->element(num_strings, stream); if (bytes_count == 0) // if no bytes, nothing to do - return std::make_pair(std::make_unique>(0, stream), - std::make_unique>(0, stream)); + return std::pair(std::make_unique>(0, stream), + std::make_unique>(0, stream)); cudf::detail::grid_1d const grid{static_cast(bytes_count), THREADS_PER_BLOCK, 1}; size_t const threads_on_device = grid.num_threads_per_block * grid.num_blocks; diff --git a/cpp/src/transform/bools_to_mask.cu b/cpp/src/transform/bools_to_mask.cu index 2cf4771890b..a1f49a5685f 100644 --- a/cpp/src/transform/bools_to_mask.cu +++ b/cpp/src/transform/bools_to_mask.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,7 +34,7 @@ std::pair, cudf::size_type> bools_to_mask( { CUDF_EXPECTS(input.type().id() == type_id::BOOL8, "Input is not of type bool"); - if (input.is_empty()) { return std::make_pair(std::make_unique(), 0); } + if (input.is_empty()) { return std::pair(std::make_unique(), 0); } auto input_device_view_ptr = column_device_view::create(input, stream); auto input_device_view = *input_device_view_ptr; @@ -45,12 +45,12 @@ std::pair, cudf::size_type> bools_to_mask( auto mask = detail::valid_if(input_begin, input_begin + input.size(), pred, stream, mr); - return std::make_pair(std::make_unique(std::move(mask.first)), mask.second); + return std::pair(std::make_unique(std::move(mask.first)), mask.second); } else { auto mask = detail::valid_if( input_device_view.begin(), input_device_view.end(), pred, stream, mr); - return std::make_pair(std::make_unique(std::move(mask.first)), mask.second); + return std::pair(std::make_unique(std::move(mask.first)), mask.second); } } diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu index 04821b09eab..60769665fca 100644 --- a/cpp/src/transform/encode.cu +++ b/cpp/src/transform/encode.cu @@ -57,7 +57,7 @@ std::pair, std::unique_ptr> encode( auto indices_column = cudf::detail::lower_bound( sorted_unique_keys->view(), input_table, column_order, null_precedence, stream, mr); - return std::make_pair(std::move(sorted_unique_keys), std::move(indices_column)); + return std::pair(std::move(sorted_unique_keys), std::move(indices_column)); } } // namespace detail diff --git a/cpp/src/transform/nans_to_nulls.cu b/cpp/src/transform/nans_to_nulls.cu index ee63e6d366f..42d41b44779 100644 --- a/cpp/src/transform/nans_to_nulls.cu +++ b/cpp/src/transform/nans_to_nulls.cu @@ -53,8 +53,7 @@ struct dispatch_nan_to_null { stream, mr); - return std::make_pair(std::make_unique(std::move(mask.first)), - mask.second); + return std::pair(std::make_unique(std::move(mask.first)), mask.second); } else { auto pred = [input_device_view] __device__(cudf::size_type idx) { return not(std::isnan(input_device_view.element(idx))); @@ -66,8 +65,7 @@ struct dispatch_nan_to_null { stream, mr); - return std::make_pair(std::make_unique(std::move(mask.first)), - mask.second); + return std::pair(std::make_unique(std::move(mask.first)), mask.second); } } @@ -85,7 +83,7 @@ struct dispatch_nan_to_null { std::pair, cudf::size_type> nans_to_nulls( column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - if (input.is_empty()) { return std::make_pair(std::make_unique(), 0); } + if (input.is_empty()) { return std::pair(std::make_unique(), 0); } return cudf::type_dispatcher(input.type(), dispatch_nan_to_null{}, input, stream, mr); } diff --git a/cpp/src/transform/one_hot_encode.cu b/cpp/src/transform/one_hot_encode.cu index 16aee349bb5..b1a8858f847 100644 --- a/cpp/src/transform/one_hot_encode.cu +++ b/cpp/src/transform/one_hot_encode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -89,7 +89,7 @@ struct one_hot_encode_launcher { auto views = cudf::split(all_encodings->view(), split_indices); table_view encodings_view{views}; - return std::make_pair(std::move(all_encodings), encodings_view); + return std::pair(std::move(all_encodings), encodings_view); } template , table_view> one_hot_encode(column_view const& { CUDF_EXPECTS(input.type() == categories.type(), "Mismatch type between input and categories."); - if (categories.is_empty()) { - return std::make_pair(make_empty_column(type_id::BOOL8), table_view{}); - } + if (categories.is_empty()) { return std::pair(make_empty_column(type_id::BOOL8), table_view{}); } if (input.is_empty()) { auto empty_data = make_empty_column(type_id::BOOL8); std::vector views(categories.size(), empty_data->view()); - return std::make_pair(std::move(empty_data), table_view{views}); + return std::pair(std::move(empty_data), table_view{views}); } return type_dispatcher(input.type(), one_hot_encode_launcher{}, input, categories, stream, mr); diff --git a/cpp/src/transpose/transpose.cu b/cpp/src/transpose/transpose.cu index b5b00b11a0f..a87cf60a252 100644 --- a/cpp/src/transpose/transpose.cu +++ b/cpp/src/transpose/transpose.cu @@ -37,7 +37,7 @@ std::pair, table_view> transpose(table_view const& input { // If there are no rows in the input, return successfully if (input.num_columns() == 0 || input.num_rows() == 0) { - return std::make_pair(std::make_unique(), table_view{}); + return std::pair(std::make_unique(), table_view{}); } // Check datatype homogeneity @@ -54,7 +54,7 @@ std::pair, table_view> transpose(table_view const& input auto splits = std::vector(splits_iter, splits_iter + input.num_rows() - 1); auto output_column_views = split(output_column->view(), splits, stream); - return std::make_pair(std::move(output_column), table_view(output_column_views)); + return std::pair(std::move(output_column), table_view(output_column_views)); } } // namespace detail diff --git a/cpp/tests/groupby/m2_tests.cpp b/cpp/tests/groupby/m2_tests.cpp index be7d6c1ce05..6f5a04e3752 100644 --- a/cpp/tests/groupby/m2_tests.cpp +++ b/cpp/tests/groupby/m2_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,8 +48,7 @@ auto compute_M2(cudf::column_view const& keys, cudf::column_view const& values) auto gb_obj = cudf::groupby::groupby(cudf::table_view({keys})); auto result = gb_obj.aggregate(requests); - return std::make_pair(std::move(result.first->release()[0]), - std::move(result.second[0].results[0])); + return std::pair(std::move(result.first->release()[0]), std::move(result.second[0].results[0])); } } // namespace diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp index 7c24c6267ca..593bb7c50af 100644 --- a/cpp/tests/groupby/merge_lists_tests.cpp +++ b/cpp/tests/groupby/merge_lists_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -47,8 +47,7 @@ auto merge_lists(vcol_views const& keys_cols, vcol_views const& values_cols) auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); auto result = gb_obj.aggregate(requests); - return std::make_pair(std::move(result.first->release()[0]), - std::move(result.second[0].results[0])); + return std::pair(std::move(result.first->release()[0]), std::move(result.second[0].results[0])); } } // namespace diff --git a/cpp/tests/groupby/merge_m2_tests.cpp b/cpp/tests/groupby/merge_m2_tests.cpp index 60067e78022..79ffebf146c 100644 --- a/cpp/tests/groupby/merge_m2_tests.cpp +++ b/cpp/tests/groupby/merge_m2_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,10 +67,9 @@ auto compute_partial_results(cudf::column_view const& keys, cudf::column_view co auto [out_keys, out_results] = gb_obj.aggregate(requests); auto const num_output_rows = out_keys->num_rows(); - return std::make_pair( - std::move(out_keys->release()[0]), - cudf::make_structs_column( - num_output_rows, std::move(out_results[0].results), 0, rmm::device_buffer{})); + return std::pair(std::move(out_keys->release()[0]), + cudf::make_structs_column( + num_output_rows, std::move(out_results[0].results), 0, rmm::device_buffer{})); } /** @@ -93,8 +92,7 @@ auto merge_M2(vcol_views const& keys_cols, vcol_views const& values_cols) auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); auto result = gb_obj.aggregate(requests); - return std::make_pair(std::move(result.first->release()[0]), - std::move(result.second[0].results[0])); + return std::pair(std::move(result.first->release()[0]), std::move(result.second[0].results[0])); } } // namespace diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp index 1e2f0c9fa9e..57f67f6b81a 100644 --- a/cpp/tests/groupby/merge_sets_tests.cpp +++ b/cpp/tests/groupby/merge_sets_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -47,8 +47,7 @@ auto merge_sets(vcol_views const& keys_cols, vcol_views const& values_cols) auto gb_obj = cudf::groupby::groupby(cudf::table_view({*keys})); auto result = gb_obj.aggregate(requests); - return std::make_pair(std::move(result.first->release()[0]), - std::move(result.second[0].results[0])); + return std::pair(std::move(result.first->release()[0]), std::move(result.second[0].results[0])); } } // namespace diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index d1dc60119b6..4b481ade83f 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -148,7 +148,7 @@ std::pair, std::shared_ptr> get_table auto schema = std::make_shared(schema_vector); - return std::make_pair( + return std::pair( std::make_unique(std::move(columns)), arrow::Table::Make( schema, {int64array, string_array, dict_array, boolarray, list_array, struct_array})); diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu index 73b355d496d..13852027bf0 100644 --- a/cpp/tests/join/conditional_join_tests.cu +++ b/cpp/tests/join/conditional_join_tests.cu @@ -93,7 +93,7 @@ std::pair, std::vector> gen_random_repeated_columns( std::mt19937 gen(rd()); std::shuffle(left.begin(), left.end(), gen); std::shuffle(right.begin(), right.end(), gen); - return std::make_pair(std::move(left), std::move(right)); + return std::pair(std::move(left), std::move(right)); } // Generate a single pair of left/right nullable columns of random data @@ -120,8 +120,8 @@ gen_random_nullable_repeated_columns(unsigned int N = 10000, unsigned int num_re return uniform_dist(gen) > 0.5; }); - return std::make_pair(std::make_pair(std::move(left), std::move(left_nulls)), - std::make_pair(std::move(right), std::move(right_nulls))); + return std::pair(std::pair(std::move(left), std::move(left_nulls)), + std::pair(std::move(right), std::move(right_nulls))); } } // namespace diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index f560ce7f20c..8ed50c8fb39 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -67,7 +67,7 @@ struct JoinTest : public cudf::test::BaseFixture { auto gold_sort_order = cudf::sorted_order(gold); auto sorted_gold = cudf::gather(gold, *gold_sort_order); - return std::make_pair(std::move(sorted_gold), std::move(sorted_result)); + return std::pair(std::move(sorted_gold), std::move(sorted_result)); } }; diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu index df5b1f5c14a..edcf1d1be27 100644 --- a/cpp/tests/join/mixed_join_tests.cu +++ b/cpp/tests/join/mixed_join_tests.cu @@ -94,7 +94,7 @@ std::pair, std::vector> gen_random_repeated_columns( std::mt19937 gen(rd()); std::shuffle(left.begin(), left.end(), gen); std::shuffle(right.begin(), right.end(), gen); - return std::make_pair(std::move(left), std::move(right)); + return std::pair(std::move(left), std::move(right)); } // Generate a single pair of left/right nullable columns of random data @@ -121,8 +121,8 @@ gen_random_nullable_repeated_columns(unsigned int N = 10000, unsigned int num_re return uniform_dist(gen) > 0.5; }); - return std::make_pair(std::make_pair(std::move(left), std::move(left_nulls)), - std::make_pair(std::move(right), std::move(right_nulls))); + return std::pair(std::pair(std::move(left), std::move(left_nulls)), + std::pair(std::move(right), std::move(right_nulls))); } } // namespace diff --git a/cpp/tests/merge/merge_test.cpp b/cpp/tests/merge/merge_test.cpp index 
ea26cad3b59..129d1ad66f3 100644 --- a/cpp/tests/merge/merge_test.cpp +++ b/cpp/tests/merge/merge_test.cpp @@ -652,8 +652,8 @@ TYPED_TEST(MergeTest_, NMerge1KeyColumns) std::vector> facts{}; std::vector tables{}; for (int i = 0; i < num_tables; ++i) { - facts.emplace_back(std::make_pair(PairT0(sequence0, sequence0 + inputRows), - PairT1(sequence1, sequence1 + inputRows))); + facts.emplace_back(std::pair(PairT0(sequence0, sequence0 + inputRows), + PairT1(sequence1, sequence1 + inputRows))); tables.push_back(cudf::table_view{{facts.back().first, facts.back().second}}); } std::vector key_cols{0}; diff --git a/cpp/tests/search/search_struct_test.cpp b/cpp/tests/search/search_struct_test.cpp index a1f0b1d81cf..159b082890a 100644 --- a/cpp/tests/search/search_struct_test.cpp +++ b/cpp/tests/search/search_struct_test.cpp @@ -57,7 +57,7 @@ auto search_bounds(cudf::column_view const& t_col_view, auto const values = cudf::table_view{std::vector{values_col->view()}}; auto result_lower_bound = cudf::lower_bound(t, values, column_orders, null_precedence); auto result_upper_bound = cudf::upper_bound(t, values, column_orders, null_precedence); - return std::make_pair(std::move(result_lower_bound), std::move(result_upper_bound)); + return std::pair(std::move(result_lower_bound), std::move(result_upper_bound)); } auto search_bounds(std::unique_ptr const& t_col, diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index 0529539c4b2..31bbd43c78d 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -71,7 +71,7 @@ TYPED_TEST(TypedDistinctCount, TableNoNull) std::vector> pair_input; std::transform( input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { - return std::make_pair(a, b); + return std::pair(a, b); }); cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); diff --git a/cpp/tests/stream_compaction/unique_count_tests.cpp b/cpp/tests/stream_compaction/unique_count_tests.cpp index 3285cd1a711..591fe042592 100644 --- a/cpp/tests/stream_compaction/unique_count_tests.cpp +++ b/cpp/tests/stream_compaction/unique_count_tests.cpp @@ -71,7 +71,7 @@ TYPED_TEST(TypedUniqueCount, TableNoNull) std::vector> pair_input; std::transform( input1.begin(), input1.end(), input2.begin(), std::back_inserter(pair_input), [](T a, T b) { - return std::make_pair(a, b); + return std::pair(a, b); }); cudf::test::fixed_width_column_wrapper input_col1(input1.begin(), input1.end()); diff --git a/cpp/tests/strings/translate_tests.cpp b/cpp/tests/strings/translate_tests.cpp index e928065dca4..53c6982b880 100644 --- a/cpp/tests/strings/translate_tests.cpp +++ b/cpp/tests/strings/translate_tests.cpp @@ -38,7 +38,7 @@ std::pair make_entry(const char* from, const c cudf::char_utf8 out = 0; cudf::strings::detail::to_char_utf8(from, in); if (to) cudf::strings::detail::to_char_utf8(to, out); - return std::make_pair(in, out); + return std::pair(in, out); } TEST_F(StringsTranslateTest, Translate) From 3d92bf257bcfb46fe5386821d7f81d4b9f4e6dd5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 27 Apr 2022 17:55:33 -0400 Subject: [PATCH 03/28] Fix scatter for all-empty-string column case (#10724) Closes #10717 Fixes bug introduced with changes in #10673 which uses the `cudf::make_strings_column` that accepts a span of `string_view` objects with a null-placeholder. 
The placeholder can be unintentionally created in `create_string_vector_from_column` when given a strings column where all the rows are empty. The utility is fixed to prevent creating the placeholder for empty strings. A gtest was added to scatter from/to an all-empty strings column to verify this behavior. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/10724 --- cpp/include/cudf/strings/detail/scatter.cuh | 7 ++++++- cpp/src/lists/copying/scatter_helper.cu | 15 +++++++++----- cpp/src/strings/utilities.cu | 22 +++++++++++++-------- cpp/tests/copying/scatter_tests.cpp | 13 +++++++++++- 4 files changed, 42 insertions(+), 15 deletions(-) diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index f167206f36b..cfede60c771 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -67,8 +67,13 @@ std::unique_ptr scatter( // create vector of string_view's to scatter into rmm::device_uvector target_vector = create_string_vector_from_column(target, stream); + // this ensures empty strings are not mapped to nulls in the make_strings_column function + auto const size = thrust::distance(begin, end); + auto itr = thrust::make_transform_iterator( + begin, [] __device__(string_view const sv) { return sv.empty() ? string_view{} : sv; }); + // do the scatter - thrust::scatter(rmm::exec_policy(stream), begin, end, scatter_map, target_vector.begin()); + thrust::scatter(rmm::exec_policy(stream), itr, itr + size, scatter_map, target_vector.begin()); // build the output column auto sv_span = cudf::device_span(target_vector); diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu index 7220e8b5980..38f738b4035 100644 --- a/cpp/src/lists/copying/scatter_helper.cu +++ b/cpp/src/lists/copying/scatter_helper.cu @@ -232,6 +232,8 @@ struct list_child_constructor { auto string_views = rmm::device_uvector(num_child_rows, stream); + auto const null_string_view = string_view{nullptr, 0}; // placeholder for factory function + thrust::transform( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -241,7 +243,8 @@ struct list_child_constructor { offset_size = list_offsets.size(), d_list_vector = list_vector.begin(), source_lists, - target_lists] __device__(auto index) { + target_lists, + null_string_view] __device__(auto index) { auto const list_index_iter = thrust::upper_bound(thrust::seq, offset_begin, offset_begin + offset_size, index); auto const list_index = @@ -254,14 +257,16 @@ struct list_child_constructor { auto child_strings_column = lists_column.child(); auto strings_offset = lists_offsets_ptr[row_index] + intra_index; - return child_strings_column.is_null(strings_offset) - ? string_view{nullptr, 0} - : child_strings_column.template element(strings_offset); + if (child_strings_column.is_null(strings_offset)) { return null_string_view; } + auto const d_str = child_strings_column.template element(strings_offset); + // ensure a string from an all-empty column is not mapped to the null placeholder + auto const empty_string_view = string_view{}; + return d_str.empty() ? empty_string_view : d_str; }); // string_views should now have been populated with source and target references. 
auto sv_span = cudf::device_span(string_views); - return cudf::make_strings_column(sv_span, string_view{nullptr, 0}, stream, mr); + return cudf::make_strings_column(sv_span, null_string_view, stream, mr); } /** diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index a7ef2afb47f..ac073f8efbc 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -46,14 +46,20 @@ rmm::device_uvector create_string_vector_from_column( auto strings_vector = rmm::device_uvector(input.size(), stream, mr); - thrust::transform( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - strings_vector.begin(), - [d_strings = *d_strings] __device__(size_type idx) { - return d_strings.is_null(idx) ? string_view{nullptr, 0} : d_strings.element(idx); - }); + thrust::transform(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + strings_vector.begin(), + [d_strings = *d_strings] __device__(size_type idx) { + // placeholder for factory function that takes a span of string_views + auto const null_string_view = string_view{nullptr, 0}; + if (d_strings.is_null(idx)) { return null_string_view; } + auto const d_str = d_strings.element(idx); + // special case when the entire column is filled with empty strings: + // here the empty d_str may have a d_str.data() == nullptr + auto const empty_string_view = string_view{}; + return d_str.empty() ? empty_string_view : d_str; + }); return strings_vector; } diff --git a/cpp/tests/copying/scatter_tests.cpp b/cpp/tests/copying/scatter_tests.cpp index 28ebb6cbcb6..306ab8a3d5c 100644 --- a/cpp/tests/copying/scatter_tests.cpp +++ b/cpp/tests/copying/scatter_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -573,6 +573,17 @@ TEST_F(ScatterStringsTests, ScatterScalarNoNulls) CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected_table); } +TEST_F(ScatterStringsTests, EmptyStrings) +{ + cudf::test::strings_column_wrapper input{"", "", ""}; + cudf::table_view t({input}); + + // Test for issue 10717: all-empty-string column scatter + auto map = cudf::test::fixed_width_column_wrapper({0}); + auto result = cudf::scatter(t, map, t); + CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), t); +} + template class BooleanMaskScatter : public cudf::test::BaseFixture { }; From f0b91179b38ba7224a33a9b1390230f8575f886a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 27 Apr 2022 17:40:48 -0500 Subject: [PATCH 04/28] Revise CONTRIBUTING.md (#10644) I have revised the `CONTRIBUTING.md` file to address several pieces that are out of date. I also revised a good portion of the text and updated external references. Finally, I wrapped the lines at 100 characters to align with other Markdown files in the C++ docs. I would prefer to adopt a convention of one sentence per line if reviewers agree, but went with the 100 character wrapping for now to be consistent with other docs. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - https://github.com/brandon-b-miller - Jason Lowe (https://github.com/jlowe) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/10644 --- CONTRIBUTING.md | 359 +++++++++++++++++++++++------------------------- 1 file changed, 171 insertions(+), 188 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 6d1c0528832..db8a8d88b99 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,69 +1,79 @@ # Contributing to cuDF -Contributions to cuDF fall into the following three categories. - -1. To report a bug, request a new feature, or report a problem with - documentation, please file an [issue](https://github.com/rapidsai/cudf/issues/new/choose) - describing in detail the problem or new feature. The RAPIDS team evaluates - and triages issues, and schedules them for a release. If you believe the - issue needs priority attention, please comment on the issue to notify the - team. -2. To propose and implement a new Feature, please file a new feature request - [issue](https://github.com/rapidsai/cudf/issues/new/choose). Describe the - intended feature and discuss the design and implementation with the team and - community. Once the team agrees that the plan looks good, go ahead and - implement it, using the [code contributions](#code-contributions) guide below. -3. To implement a feature or bug-fix for an existing outstanding issue, please - Follow the [code contributions](#code-contributions) guide below. If you - need more context on a particular issue, please ask in a comment. - -As contributors and maintainers to this project, -you are expected to abide by cuDF's code of conduct. -More information can be found at: [Contributor Code of Conduct](https://docs.rapids.ai/resources/conduct/). +Contributions to cuDF fall into the following categories: + +1. To report a bug, request a new feature, or report a problem with documentation, please file an + [issue](https://github.com/rapidsai/cudf/issues/new/choose) describing the problem or new feature + in detail. The RAPIDS team evaluates and triages issues, and schedules them for a release. If you + believe the issue needs priority attention, please comment on the issue to notify the team. +2. To propose and implement a new feature, please file a new feature request + [issue](https://github.com/rapidsai/cudf/issues/new/choose). Describe the intended feature and + discuss the design and implementation with the team and community. Once the team agrees that the + plan looks good, go ahead and implement it, using the [code contributions](#code-contributions) + guide below. +3. To implement a feature or bug fix for an existing issue, please follow the [code + contributions](#code-contributions) guide below. If you need more context on a particular issue, + please ask in a comment. + +As contributors and maintainers to this project, you are expected to abide by cuDF's code of +conduct. More information can be found at: +[Contributor Code of Conduct](https://docs.rapids.ai/resources/conduct/). ## Code contributions ### Your first issue -1. Follow the guide at the bottom of this page for [Setting Up Your Build Environment](#setting-up-your-build-environment). -2. Find an issue to work on. 
The best way is to look for the [good first issue](https://github.com/rapidsai/cudf/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) - or [help wanted](https://github.com/rapidsai/cudf/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) labels. +1. Follow the guide at the bottom of this page for + [Setting up your build environment](#setting-up-your-build-environment). +2. Find an issue to work on. The best way is to look for the + [good first issue](https://github.com/rapidsai/cudf/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) + or [help wanted](https://github.com/rapidsai/cudf/issues?q=is%3Aissue+is%3Aopen+label%3A%22help+wanted%22) + labels. 3. Comment on the issue stating that you are going to work on it. -4. Code! Make sure to update unit tests! -5. When done, [create your pull request](https://github.com/rapidsai/cudf/compare). -6. Verify that CI passes all [status checks](https://help.github.com/articles/about-status-checks/). Fix if needed. -7. Wait for other developers to review your code and update code as needed. -8. Once reviewed and approved, a RAPIDS developer will merge your pull request. - -Remember, if you are unsure about anything, don't hesitate to comment on issues -and ask for clarifications! +4. Create a fork of the cudf repository and check out a branch with a name that + describes your planned work. For example, `fix-documentation`. +5. Write code to address the issue or implement the feature. +6. Add unit tests and unit benchmarks. +7. [Create your pull request](https://github.com/rapidsai/cudf/compare). +8. Verify that CI passes all [status checks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks). + Fix if needed. +9. Wait for other developers to review your code and update code as needed. +10. Once reviewed and approved, a RAPIDS developer will merge your pull request. + +If you are unsure about anything, don't hesitate to comment on issues and ask for clarification! ### Seasoned developers -Once you have gotten your feet wet and are more comfortable with the code, you -can look at the prioritized issues for our next release in our [project boards](https://github.com/rapidsai/cudf/projects). - -> **Pro Tip:** Always look at the release board with the highest number for -issues to work on. This is where RAPIDS developers also focus their efforts. +Once you have gotten your feet wet and are more comfortable with the code, you can look at the +prioritized issues for our next release in our +[project boards](https://github.com/rapidsai/cudf/projects). -Look at the unassigned issues, and find an issue to which you are comfortable -contributing. Start with _Step 3_ above, commenting on the issue to let -others know you are working on it. If you have any questions related to the -implementation of the issue, ask them in the issue instead of the PR. +**Note:** Always look at the release board that is +[currently under development](https://docs.rapids.ai/maintainers) for issues to work on. This is +where RAPIDS developers also focus their efforts. -## Setting Up Your Build Environment +Look at the unassigned issues, and find an issue to which you are comfortable contributing. Start +with _Step 3_ above, commenting on the issue to let others know you are working on it. 
If you have +any questions related to the implementation of the issue, ask them in the issue instead of the PR. -The following instructions are for developers and contributors to cuDF OSS development. These instructions are tested on Linux Ubuntu 16.04 & 18.04. Use these instructions to build cuDF from source and contribute to its development. Other operating systems may be compatible, but are not currently tested. +## Setting up your build environment +The following instructions are for developers and contributors to cuDF development. These +instructions are tested on Ubuntu Linux LTS releases. Use these instructions to build cuDF from +source and contribute to its development. Other operating systems may be compatible, but are not +currently tested. +Building cudf with the provided conda environment is recommended for users who wish to enable all +library features. The following instructions are for building with a conda environment. Dependencies +for a minimal build of libcudf without using conda are also listed below. ### General requirements Compilers: -* `gcc` version 9.3+ -* `nvcc` version 11.5+ -* `cmake` version 3.20.1+ +* `gcc` version 9.3+ +* `nvcc` version 11.5+ +* `cmake` version 3.20.1+ CUDA/GPU: @@ -71,127 +81,166 @@ CUDA/GPU: * NVIDIA driver 450.80.02+ * Pascal architecture or better -You can obtain CUDA from [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads). +You can obtain CUDA from +[https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads). -### Create the build Environment +### Create the build environment + +- Clone the repository: -- Clone the repository and submodules ```bash CUDF_HOME=$(pwd)/cudf git clone https://github.com/rapidsai/cudf.git $CUDF_HOME cd $CUDF_HOME -git submodule update --init --remote --recursive ``` + +#### Building with a conda environment + +**Note:** Using a conda environment is the easiest way to satisfy the library's dependencies. +Instructions for a minimal build environment without conda are included below. + - Create the conda development environment `cudf_dev`: + ```bash # create the conda environment (assuming in base `cudf` directory) -# note: RAPIDS currently doesn't support `channel_priority: strict`; use `channel_priority: flexible` instead +# note: RAPIDS currently doesn't support `channel_priority: strict`; +# use `channel_priority: flexible` instead conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda11.5.yml # activate the environment conda activate cudf_dev ``` -- For other CUDA versions, check the corresponding cudf_dev_cuda*.yml file in conda/environments + +- **Note**: the conda environment files are updated frequently, so the + development environment may also need to be updated if dependency versions or + pinnings are changed. + +- For other CUDA versions, check the corresponding `cudf_dev_cuda*.yml` file in + `conda/environments/`. + +#### Building without a conda environment + +- libcudf has the following minimal dependencies (in addition to those listed in the [General + requirements](#general-requirements)). The packages listed below use Ubuntu package names: + + - `build-essential` + - `libssl-dev` + - `libz-dev` + - `libpython3-dev` (required if building cudf) ### Build cuDF from source -- A `build.sh` script is provided in `$CUDF_HOME`. Running the script with no additional arguments will install the `libcudf`, `cudf` and `dask_cudf` libraries. 
By default, the libraries are installed to the `$CONDA_PREFIX` directory. To install into a different location, set the location in `$INSTALL_PREFIX`. Finally, note that the script depends on the `nvcc` executable being on your path, or defined in `$CUDACXX`. +- A `build.sh` script is provided in `$CUDF_HOME`. Running the script with no additional arguments + will install the `libcudf`, `cudf` and `dask_cudf` libraries. By default, the libraries are + installed to the `$CONDA_PREFIX` directory. To install into a different location, set the location + in `$INSTALL_PREFIX`. Finally, note that the script depends on the `nvcc` executable being on your + path, or defined in `$CUDACXX`. + ```bash cd $CUDF_HOME # Choose one of the following commands, depending on whether -# you want to build and install the libcudf C++ library only, +# you want to build and install the libcudf C++ library only, # or include the cudf and/or dask_cudf Python libraries: ./build.sh # libcudf, cudf and dask_cudf ./build.sh libcudf # libcudf only -./build.sh libcudf cudf # libcudf and cudf only +./build.sh libcudf cudf # libcudf and cudf only ``` -- Other libraries like `cudf-kafka` and `custreamz` can be installed with this script. For the complete list of libraries as well as details about the script usage, run the `help` command: + +- Other libraries like `cudf-kafka` and `custreamz` can be installed with this script. For the + complete list of libraries as well as details about the script usage, run the `help` command: + ```bash -./build.sh --help +./build.sh --help ``` ### Build, install and test cuDF libraries for contributors -The general workflow is provided below. Please, also see the last section about [code formatting](###code-formatting). +The general workflow is provided below. Please also see the last section about +[code formatting](#code-formatting). 
#### `libcudf` (C++) -If you're only interested in building the library (and not the unit tests): - +- If you're only interested in building the library (and not the unit tests): + ```bash cd $CUDF_HOME ./build.sh libcudf ``` -If, in addition, you want to build tests: + +- If, in addition, you want to build tests: ```bash ./build.sh libcudf tests ``` -To run the tests: + +- To run the tests: ```bash -make test +make test ``` #### `cudf` (Python) - First, build the `libcudf` C++ library following the steps above -- To build and install in edit/develop `cudf` python package: +- To build and install in edit/develop `cudf` Python package: ```bash cd $CUDF_HOME/python/cudf python setup.py build_ext --inplace python setup.py develop ``` -- To run `cudf` tests : +- To run `cudf` tests: ```bash cd $CUDF_HOME/python -py.test -v cudf/cudf/tests +pytest -v cudf/cudf/tests ``` #### `dask-cudf` (Python) - First, build the `libcudf` C++ and `cudf` Python libraries following the steps above -- To install in edit/develop mode the `dask-cudf` python package: +- To install the `dask-cudf` Python package in editable/develop mode: ```bash cd $CUDF_HOME/python/dask_cudf python setup.py build_ext --inplace python setup.py develop ``` -- To run `dask_cudf` tests : +- To run `dask_cudf` tests: ```bash cd $CUDF_HOME/python -py.test -v dask_cudf +pytest -v dask_cudf ``` #### `libcudf_kafka` (C++) -If you're only interested in building the library (and not the unit tests): - +- If you're only interested in building the library (and not the unit tests): + ```bash cd $CUDF_HOME ./build.sh libcudf_kafka ``` -If, in addition, you want to build tests: + +- If, in addition, you want to build tests: ```bash ./build.sh libcudf_kafka tests ``` -To run the tests: + +- To run the tests: ```bash -make test +make test ``` #### `cudf-kafka` (Python) -- First, build the `libcudf` and `libcudf_kafka` following the steps above +- First, build the `libcudf` and `libcudf_kafka` libraries following the steps above + +- To install the `cudf-kafka` Python package in editable/develop mode: -- To install in edit/develop mode the `cudf-kafka` python package: ```bash cd $CUDF_HOME/python/cudf_kafka python setup.py build_ext --inplace @@ -202,7 +251,8 @@ python setup.py develop - First, build `libcudf`, `libcudf_kafka`, and `cudf_kafka` following the steps above -- To install in edit/develop mode the `custreamz` python package: +- To install the `custreamz` Python package in editable/develop mode: + ```bash cd $CUDF_HOME/python/custreamz python setup.py build_ext --inplace @@ -210,40 +260,45 @@ python setup.py develop ``` - To run `custreamz` tests : + ```bash cd $CUDF_HOME/python -py.test -v custreamz +pytest -v custreamz ``` #### `cudf` (Java): - First, build the `libcudf` C++ library following the steps above -- Then, refer to [Java README](https://github.com/rapidsai/cudf/blob/branch-21.10/java/README.md) - +- Then, refer to the [Java README](java/README.md) -Done! You are ready to develop for the cuDF OSS project. But please go to [code formatting](###code-formatting) to ensure that you contributing code follows the expected format. +Done! You are ready to develop for the cuDF project. Please review the project's +[code formatting guidelines](#code-formatting). ## Debugging cuDF -### Building Debug mode from source +### Building in debug mode from source -Follow the [above instructions](####build-cudf-from-source) to build from source and add `-g` to the `./build.sh` command. 
+Follow the instructions to [build from source](#build-cudf-from-source) and add `-g` to the +`./build.sh` command. For example: + ```bash ./build.sh libcudf -g ``` -This builds `libcudf` in Debug mode which enables some `assert` safety checks and includes symbols in the library for debugging. +This builds `libcudf` in debug mode which enables some `assert` safety checks and includes symbols +in the library for debugging. All other steps for installing `libcudf` into your environment are the same. ### Debugging with `cuda-gdb` and `cuda-memcheck` -When you have a debug build of `libcudf` installed, debugging with the `cuda-gdb` and `cuda-memcheck` is easy. +When you have a debug build of `libcudf` installed, debugging with the `cuda-gdb` and +`cuda-memcheck` is easy. -If you are debugging a Python script, simply run the following: +If you are debugging a Python script, run the following: ```bash cuda-gdb -ex r --args python .py @@ -255,143 +310,71 @@ cuda-memcheck python .py ### Device debug symbols -The device debug symbols are not automatically added with the cmake `Debug` -build type because it causes a runtime delay of several minutes when loading -the libcudf.so library. +The device debug symbols are not automatically added with the cmake `Debug` build type because it +causes a runtime delay of several minutes when loading the libcudf.so library. -Therefore, it is recommended to add device debug symbols only to specific files by -setting the `-G` compile option locally in your `cpp/CMakeLists.txt` for that file. -Here is an example of adding the `-G` option to the compile command for -`src/copying/copy.cu` source file: +Therefore, it is recommended to add device debug symbols only to specific files by setting the `-G` +compile option locally in your `cpp/CMakeLists.txt` for that file. Here is an example of adding the +`-G` option to the compile command for `src/copying/copy.cu` source file: -``` +```cmake set_source_files_properties(src/copying/copy.cu PROPERTIES COMPILE_OPTIONS "-G") ``` -This will add the device debug symbols for this object file in libcudf.so. -You can then use `cuda-dbg` to debug into the kernels in that source file. - -### Building and Testing on a gpuCI image locally - -Before submitting a pull request, you can do a local build and test on your machine that mimics our gpuCI environment using the `ci/local/build.sh` script. -For detailed information on usage of this script, see [here](ci/local/README.md). - +This will add the device debug symbols for this object file in `libcudf.so`. You can then use +`cuda-dbg` to debug into the kernels in that source file. -## Automated Build in Docker Container +## Code Formatting -A Dockerfile is provided with a preconfigured conda environment for building and installing cuDF from source based off of the main branch. +### C++/CUDA -### Prerequisites +cuDF uses [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html). -* Install [nvidia-docker2](https://github.com/nvidia/nvidia-docker/wiki/Installation-(version-2.0)) for Docker + GPU support -* Verify NVIDIA driver is `450.80.02` or higher -* Ensure CUDA 11.0+ is installed - -### Usage +In order to format the C++/CUDA files, navigate to the root (`cudf`) directory and run: -From cudf project root run the following, to build with defaults: -```bash -docker build --tag cudf . 
-``` -After the container is built run the container: ```bash -docker run --runtime=nvidia -it cudf bash -``` -Activate the conda environment `cudf` to use the newly built cuDF and libcudf libraries: -``` -root@3f689ba9c842:/# source activate cudf -(cudf) root@3f689ba9c842:/# python -c "import cudf" -(cudf) root@3f689ba9c842:/# +python3 ./cpp/scripts/run-clang-format.py -inplace ``` -### Customizing the Build - -Several build arguments are available to customize the build process of the -container. These are specified by using the Docker [build-arg](https://docs.docker.com/engine/reference/commandline/build/#set-build-time-variables---build-arg) -flag. Below is a list of the available arguments and their purpose: +Additionally, many editors have plugins or extensions that you can set up to automatically run +`clang-format` either manually or on file save. -| Build Argument | Default Value | Other Value(s) | Purpose | -| --- | --- | --- | --- | -| `CUDA_VERSION` | 11.0 | 11.2.2 | set CUDA version | -| `LINUX_VERSION` | ubuntu18.04 | ubuntu20.04 | set Ubuntu version | -| `CC` & `CXX` | 9 | 10 | set gcc/g++ version | -| `CUDF_REPO` | This repo | Forks of cuDF | set git URL to use for `git clone` | -| `CUDF_BRANCH` | main | Any branch name | set git branch to checkout of `CUDF_REPO` | -| `NUMBA_VERSION` | newest | >=0.40.0 | set numba version | -| `NUMPY_VERSION` | newest | >=1.14.3 | set numpy version | -| `PANDAS_VERSION` | newest | >=0.23.4 | set pandas version | -| `PYARROW_VERSION` | 1.0.1 | Not supported | set pyarrow version | -| `CMAKE_VERSION` | newest | >=3.18 | set cmake version | -| `CYTHON_VERSION` | 0.29 | Not supported | set Cython version | -| `PYTHON_VERSION` | 3.7 | 3.8 | set python version | +### Python / Pre-commit hooks +cuDF uses [pre-commit](https://pre-commit.com/) to execute code linters and formatters such as +[Black](https://black.readthedocs.io/en/stable/), [isort](https://pycqa.github.io/isort/), and +[flake8](https://flake8.pycqa.org/en/latest/). These tools ensure a consistent code format +throughout the project. Using pre-commit ensures that linter versions and options are aligned for +all developers. Additionally, there is a CI check in place to enforce that committed code follows +our standards. -### Code Formatting - - -#### Python - -cuDF uses [Black](https://black.readthedocs.io/en/stable/), -[isort](https://readthedocs.org/projects/isort/), and -[flake8](http://flake8.pycqa.org/en/latest/) to ensure a consistent code format -throughout the project. They have been installed during the `cudf_dev` environment creation. - -These tools are used to auto-format the Python code, as well as check the Cython -code in the repository. Additionally, there is a CI check in place to enforce -that committed code follows our standards. You can use the tools to -automatically format your python code by running: +To use `pre-commit`, install via `conda` or `pip`: ```bash -isort --atomic python/**/*.py -black python +conda install -c conda-forge pre-commit ``` -and then check the syntax of your Python and Cython code by running: - ```bash -flake8 python -flake8 --config=python/.flake8.cython -``` - -Additionally, many editors have plugins that will apply `isort` and `Black` as -you edit files, as well as use `flake8` to report any style / syntax issues. 
-
-#### C++/CUDA
-
-cuDF uses [`clang-format`](https://clang.llvm.org/docs/ClangFormat.html)
-
-In order to format the C++/CUDA files, navigate to the root (`cudf`) directory and run:
-```
-python3 ./cpp/scripts/run-clang-format.py -inplace
-```
-
-Additionally, many editors have plugins or extensions that you can set up to automatically run `clang-format` either manually or on file save.
-
-#### Pre-commit hooks
-
-Optionally, you may wish to setup [pre-commit hooks](https://pre-commit.com/)
-to automatically run `isort`, `Black`, `flake8` and `clang-format` when you make a git commit.
-This can be done by installing `pre-commit` via `conda` or `pip`:

```bash
-conda install -c conda-forge pre_commit
```

```bash
pip install pre-commit
```

-and then running:
+Then run pre-commit hooks before committing code:

```bash
-pre-commit install
+pre-commit run
```

+Optionally, you may set up the pre-commit hooks to run automatically when you make a git commit. This can be done by running:

```bash
pre-commit install
```

-from the root of the cuDF repository. Now `isort`, `Black`, `flake8` and `clang-format` will be
-run each time you commit changes.
+Now code linters and formatters will be run each time you commit changes.

----
+You can skip these checks with `git commit --no-verify` or with the short version `git commit -n`.

## Attribution
+
Portions adopted from https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md
Portions adopted from https://github.com/dask/dask/blob/master/docs/source/develop.rst

From 03d419d96753d29cf24226ab661377da23eef969 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" 
Date: Thu, 28 Apr 2022 08:27:45 -0500
Subject: [PATCH 05/28] Prepare dask_cudf test_parquet.py for upcoming API
 changes (#10709)

This is a relatively simple PR to clean up `dask_cudf`'s `to/read_parquet` tests. These changes are mostly meant to avoid **future** test failures that will arise after impending changes are implemented in upstream Dask. These changes include:

- The default value for `write_metadata_file` will become `False` for `to_parquet` (because writing the _metadata file scales very poorly)
- The default value for `split_row_groups` will become `False` (because this setting is typically optimal when the files are not too large). Users with larger-than-memory files will need to specify `split_row_groups=True/int` explicitly.
- The `gather_statistics` argument will be removed in favor of a more descriptive `calculate_divisions` argument.

This PR also removes the long-deprecated `row_groups_per_part` argument from `dask_cudf.read_parquet` (established replacement is `split_row_groups`).
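As a minimal, hedged sketch of what this migration means for users (the dataset path is illustrative; the `split_row_groups` semantics follow the `read_parquet` docstring added in the diff below):

```python
import dask_cudf

# Map each Parquet row-group to a distinct DataFrame partition explicitly,
# instead of relying on the old automatic-splitting default:
ddf = dask_cudf.read_parquet("/path/to/dataset/", split_row_groups=True)

# Or allow at most 10 row-groups in each output partition:
ddf = dask_cudf.read_parquet("/path/to/dataset/", split_row_groups=10)
```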
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Ray Douglass (https://github.com/raydouglass) - gpuCI (https://github.com/GPUtester) - Mike Wendt (https://github.com/mike-wendt) - AJ Schmidt (https://github.com/ajschmidt8) Approvers: - Benjamin Zaitlen (https://github.com/quasiben) - GALI PREM SAGAR (https://github.com/galipremsagar) - Randy Gelhausen (https://github.com/randerzander) URL: https://github.com/rapidsai/cudf/pull/10709 --- python/dask_cudf/dask_cudf/io/parquet.py | 212 +++++++++++------- .../dask_cudf/io/tests/test_parquet.py | 106 ++++++--- 2 files changed, 197 insertions(+), 121 deletions(-) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 042759f68cf..b201626becf 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -177,65 +177,98 @@ def read_partition( strings_to_cats = kwargs.get("strings_to_categorical", False) read_kwargs = kwargs.get("read", {}) read_kwargs.update(open_file_options or {}) - - # Assume multi-piece read - paths = [] - rgs = [] - last_partition_keys = None - dfs = [] - - for i, piece in enumerate(pieces): - - (path, row_group, partition_keys) = piece - row_group = None if row_group == [None] else row_group - - if i > 0 and partition_keys != last_partition_keys: - dfs.append( - cls._read_paths( - paths, - fs, - columns=read_columns, - row_groups=rgs if rgs else None, - strings_to_categorical=strings_to_cats, - partitions=partitions, - partitioning=partitioning, - partition_keys=last_partition_keys, - **read_kwargs, + check_file_size = read_kwargs.pop("check_file_size", None) + + # Wrap reading logic in a `try` block so that we can + # inform the user that the `read_parquet` partition + # size is too large for the available memory + try: + + # Assume multi-piece read + paths = [] + rgs = [] + last_partition_keys = None + dfs = [] + + for i, piece in enumerate(pieces): + + (path, row_group, partition_keys) = piece + row_group = None if row_group == [None] else row_group + + # File-size check to help "protect" users from change + # to up-stream `split_row_groups` default. We only + # check the file size if this partition corresponds + # to a full file, and `check_file_size` is defined + if check_file_size and len(pieces) == 1 and row_group is None: + file_size = fs.size(path) + if file_size > check_file_size: + warnings.warn( + f"A large parquet file ({file_size}B) is being " + f"used to create a DataFrame partition in " + f"read_parquet. This may cause out of memory " + f"exceptions in operations downstream. See the " + f"notes on split_row_groups in the read_parquet " + f"documentation. Setting split_row_groups " + f"explicitly will silence this warning." 
+ ) + + if i > 0 and partition_keys != last_partition_keys: + dfs.append( + cls._read_paths( + paths, + fs, + columns=read_columns, + row_groups=rgs if rgs else None, + strings_to_categorical=strings_to_cats, + partitions=partitions, + partitioning=partitioning, + partition_keys=last_partition_keys, + **read_kwargs, + ) ) + paths = rgs = [] + last_partition_keys = None + paths.append(path) + rgs.append( + [row_group] + if not isinstance(row_group, list) + and row_group is not None + else row_group ) - paths = rgs = [] - last_partition_keys = None - paths.append(path) - rgs.append( - [row_group] - if not isinstance(row_group, list) and row_group is not None - else row_group - ) - last_partition_keys = partition_keys + last_partition_keys = partition_keys - dfs.append( - cls._read_paths( - paths, - fs, - columns=read_columns, - row_groups=rgs if rgs else None, - strings_to_categorical=strings_to_cats, - partitions=partitions, - partitioning=partitioning, - partition_keys=last_partition_keys, - **read_kwargs, + dfs.append( + cls._read_paths( + paths, + fs, + columns=read_columns, + row_groups=rgs if rgs else None, + strings_to_categorical=strings_to_cats, + partitions=partitions, + partitioning=partitioning, + partition_keys=last_partition_keys, + **read_kwargs, + ) ) - ) - df = cudf.concat(dfs) if len(dfs) > 1 else dfs[0] - - # Re-set "object" dtypes align with pa schema - set_object_dtypes_from_pa_schema(df, schema) + df = cudf.concat(dfs) if len(dfs) > 1 else dfs[0] - if index and (index[0] in df.columns): - df = df.set_index(index[0]) - elif index is False and df.index.names != (None,): - # If index=False, we shouldn't have a named index - df.reset_index(inplace=True) + # Re-set "object" dtypes align with pa schema + set_object_dtypes_from_pa_schema(df, schema) + + if index and (index[0] in df.columns): + df = df.set_index(index[0]) + elif index is False and df.index.names != (None,): + # If index=False, we shouldn't have a named index + df.reset_index(inplace=True) + + except MemoryError as err: + raise MemoryError( + "Parquet data was larger than the available GPU memory!\n\n" + "See the notes on split_row_groups in the read_parquet " + "documentation.\n\n" + "Original Error: " + str(err) + ) + raise err return df @@ -349,25 +382,34 @@ def set_object_dtypes_from_pa_schema(df, schema): df._data[col_name] = col.astype(typ) -def read_parquet( - path, - columns=None, - split_row_groups=None, - row_groups_per_part=None, - **kwargs, -): +def read_parquet(path, columns=None, **kwargs): """Read parquet files into a Dask DataFrame - Calls ``dask.dataframe.read_parquet`` to cordinate the execution of - ``cudf.read_parquet``, and ultimately read multiple partitions into - a single Dask dataframe. The Dask version must supply an - ``ArrowDatasetEngine`` class to support full functionality. - See ``cudf.read_parquet`` and Dask documentation for further details. + Calls ``dask.dataframe.read_parquet`` with ``engine=CudfEngine`` + to cordinate the execution of ``cudf.read_parquet``, and to + ultimately create a ``dask_cudf.DataFrame`` collection. + + See the ``dask.dataframe.read_parquet`` documentation for + all available options. 
Examples -------- - >>> import dask_cudf - >>> df = dask_cudf.read_parquet("/path/to/dataset/") # doctest: +SKIP + >>> from dask_cudf import read_parquet + >>> df = read_parquet("/path/to/dataset/") # doctest: +SKIP + + When dealing with one or more large parquet files having an + in-memory footprint >15% device memory, the ``split_row_groups`` + argument should be used to map Parquet **row-groups** to DataFrame + partitions (instead of **files** to partitions). For example, the + following code will map each row-group to a distinct partition: + + >>> df = read_parquet(..., split_row_groups=True) # doctest: +SKIP + + To map **multiple** row-groups to each partition, an integer can be + passed to ``split_row_groups`` to specify the **maximum** number of + row-groups allowed in each output partition: + + >>> df = read_parquet(..., split_row_groups=10) # doctest: +SKIP See Also -------- @@ -376,22 +418,24 @@ def read_parquet( if isinstance(columns, str): columns = [columns] - if row_groups_per_part: - warnings.warn( - "row_groups_per_part is deprecated. " - "Pass an integer value to split_row_groups instead.", - FutureWarning, - ) - if split_row_groups is None: - split_row_groups = row_groups_per_part - - return dd.read_parquet( - path, - columns=columns, - split_row_groups=split_row_groups, - engine=CudfEngine, - **kwargs, - ) + # Set "check_file_size" option to determine whether we + # should check the parquet-file size. This check is meant + # to "protect" users from `split_row_groups` default changes + check_file_size = kwargs.pop("check_file_size", 500_000_000) + if ( + check_file_size + and ("split_row_groups" not in kwargs) + and ("chunksize" not in kwargs) + ): + # User is not specifying `split_row_groups` or `chunksize`, + # so we should warn them if/when a file is ~>0.5GB on disk. + # They can set `split_row_groups` explicitly to silence/skip + # this check + if "read" not in kwargs: + kwargs["read"] = {} + kwargs["read"]["check_file_size"] = check_file_size + + return dd.read_parquet(path, columns=columns, engine=CudfEngine, **kwargs) to_parquet = partial(dd.to_parquet, engine=CudfEngine) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index d9b8ee4595a..ef5741b0539 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -36,42 +36,55 @@ ddf = dd.from_pandas(df, npartitions=npartitions) -@pytest.mark.parametrize("stats", [True, False]) -def test_roundtrip_from_dask(tmpdir, stats): +# Helper function to make it easier to handle the +# upcoming deprecation of `gather_statistics`. 
+# See: https://github.com/dask/dask/issues/8937 +# TODO: This function should be used to switch to +# the "new" `calculate_divisions` kwarg (for newer +# Dask versions) once it is introduced +def _divisions(setting): + return {"gather_statistics": setting} + + +@pytest.mark.parametrize("write_metadata_file", [True, False]) +@pytest.mark.parametrize("divisions", [True, False]) +def test_roundtrip_from_dask(tmpdir, divisions, write_metadata_file): tmpdir = str(tmpdir) - ddf.to_parquet(tmpdir, engine="pyarrow") + ddf.to_parquet( + tmpdir, write_metadata_file=write_metadata_file, engine="pyarrow" + ) files = sorted( (os.path.join(tmpdir, f) for f in os.listdir(tmpdir)), key=natural_sort_key, ) # Read list of parquet files - ddf2 = dask_cudf.read_parquet(files, gather_statistics=stats) - dd.assert_eq(ddf, ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(files, **_divisions(divisions)) + dd.assert_eq(ddf, ddf2, check_divisions=divisions) # Specify columns=['x'] ddf2 = dask_cudf.read_parquet( - files, columns=["x"], gather_statistics=stats + files, columns=["x"], **_divisions(divisions) ) - dd.assert_eq(ddf[["x"]], ddf2, check_divisions=stats) + dd.assert_eq(ddf[["x"]], ddf2, check_divisions=divisions) # Specify columns='y' - ddf2 = dask_cudf.read_parquet(files, columns="y", gather_statistics=stats) - dd.assert_eq(ddf[["y"]], ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(files, columns="y", **_divisions(divisions)) + dd.assert_eq(ddf[["y"]], ddf2, check_divisions=divisions) # Now include metadata - ddf2 = dask_cudf.read_parquet(tmpdir, gather_statistics=stats) - dd.assert_eq(ddf, ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(tmpdir, **_divisions(divisions)) + dd.assert_eq(ddf, ddf2, check_divisions=divisions) # Specify columns=['x'] (with metadata) ddf2 = dask_cudf.read_parquet( - tmpdir, columns=["x"], gather_statistics=stats + tmpdir, columns=["x"], **_divisions(divisions) ) - dd.assert_eq(ddf[["x"]], ddf2, check_divisions=stats) + dd.assert_eq(ddf[["x"]], ddf2, check_divisions=divisions) # Specify columns='y' (with metadata) - ddf2 = dask_cudf.read_parquet(tmpdir, columns="y", gather_statistics=stats) - dd.assert_eq(ddf[["y"]], ddf2, check_divisions=stats) + ddf2 = dask_cudf.read_parquet(tmpdir, columns="y", **_divisions(divisions)) + dd.assert_eq(ddf[["y"]], ddf2, check_divisions=divisions) def test_roundtrip_from_dask_index_false(tmpdir): @@ -99,8 +112,8 @@ def test_roundtrip_from_dask_cudf(tmpdir, write_meta): gddf = dask_cudf.from_dask_dataframe(ddf) gddf.to_parquet(tmpdir, write_metadata_file=write_meta) - gddf2 = dask_cudf.read_parquet(tmpdir) - dd.assert_eq(gddf, gddf2, check_divisions=write_meta) + gddf2 = dask_cudf.read_parquet(tmpdir, **_divisions(True)) + dd.assert_eq(gddf, gddf2) def test_roundtrip_none_rangeindex(tmpdir): @@ -161,21 +174,21 @@ def test_dask_timeseries_from_pandas(tmpdir): @pytest.mark.parametrize("index", [False, None]) -@pytest.mark.parametrize("stats", [False, True]) -def test_dask_timeseries_from_dask(tmpdir, index, stats): +@pytest.mark.parametrize("divisions", [False, True]) +def test_dask_timeseries_from_dask(tmpdir, index, divisions): fn = str(tmpdir) ddf2 = dask.datasets.timeseries(freq="D") ddf2.to_parquet(fn, engine="pyarrow", write_index=index) - read_df = dask_cudf.read_parquet(fn, index=index, gather_statistics=stats) + read_df = dask_cudf.read_parquet(fn, index=index, **_divisions(divisions)) dd.assert_eq( - ddf2, read_df, check_divisions=(stats and index), check_index=index + ddf2, read_df, 
check_divisions=(divisions and index), check_index=index ) @pytest.mark.parametrize("index", [False, None]) -@pytest.mark.parametrize("stats", [False, True]) -def test_dask_timeseries_from_daskcudf(tmpdir, index, stats): +@pytest.mark.parametrize("divisions", [False, True]) +def test_dask_timeseries_from_daskcudf(tmpdir, index, divisions): fn = str(tmpdir) ddf2 = dask_cudf.from_cudf( @@ -183,9 +196,9 @@ def test_dask_timeseries_from_daskcudf(tmpdir, index, stats): ) ddf2.name = ddf2.name.astype("object") ddf2.to_parquet(fn, write_index=index) - read_df = dask_cudf.read_parquet(fn, index=index, gather_statistics=stats) + read_df = dask_cudf.read_parquet(fn, index=index, **_divisions(divisions)) dd.assert_eq( - ddf2, read_df, check_divisions=(stats and index), check_index=index + ddf2, read_df, check_divisions=(divisions and index), check_index=index ) @@ -212,17 +225,23 @@ def test_filters(tmpdir): ddf.to_parquet(tmp_path, engine="pyarrow") - a = dask_cudf.read_parquet(tmp_path, filters=[("x", ">", 4)]) + a = dask_cudf.read_parquet( + tmp_path, filters=[("x", ">", 4)], split_row_groups=True + ) assert a.npartitions == 3 assert (a.x > 3).all().compute() - b = dask_cudf.read_parquet(tmp_path, filters=[("y", "==", "c")]) + b = dask_cudf.read_parquet( + tmp_path, filters=[("y", "==", "c")], split_row_groups=True + ) assert b.npartitions == 1 b = b.compute().to_pandas() assert (b.y == "c").all() c = dask_cudf.read_parquet( - tmp_path, filters=[("y", "==", "c"), ("x", ">", 6)] + tmp_path, + filters=[("y", "==", "c"), ("x", ">", 6)], + split_row_groups=True, ) assert c.npartitions <= 1 assert not len(c) @@ -237,13 +256,17 @@ def test_filters_at_row_group_level(tmpdir): ddf.to_parquet(tmp_path, engine="pyarrow", row_group_size=10 / 5) - a = dask_cudf.read_parquet(tmp_path, filters=[("x", "==", 1)]) + a = dask_cudf.read_parquet( + tmp_path, filters=[("x", "==", 1)], split_row_groups=True + ) assert a.npartitions == 1 assert (a.shape[0] == 2).compute() ddf.to_parquet(tmp_path, engine="pyarrow", row_group_size=1) - b = dask_cudf.read_parquet(tmp_path, filters=[("x", "==", 1)]) + b = dask_cudf.read_parquet( + tmp_path, filters=[("x", "==", 1)], split_row_groups=True + ) assert b.npartitions == 1 assert (b.shape[0] == 1).compute() @@ -341,7 +364,7 @@ def test_chunksize(tmpdir, chunksize, metadata): path, chunksize=chunksize, split_row_groups=True, - gather_statistics=True, + **_divisions(True), ) ddf2.compute(scheduler="synchronous") @@ -360,8 +383,8 @@ def test_chunksize(tmpdir, chunksize, metadata): path, chunksize=chunksize, split_row_groups=True, - gather_statistics=True, aggregate_files=True, + **_divisions(True), ) dd.assert_eq(ddf1, ddf3, check_divisions=False) @@ -382,7 +405,7 @@ def test_chunksize(tmpdir, chunksize, metadata): @pytest.mark.parametrize("row_groups", [1, 3, 10, 12]) @pytest.mark.parametrize("index", [False, True]) -def test_row_groups_per_part(tmpdir, row_groups, index): +def test_split_row_groups(tmpdir, row_groups, index): nparts = 2 df_size = 100 row_group_size = 5 @@ -410,7 +433,7 @@ def test_row_groups_per_part(tmpdir, row_groups, index): ddf2 = dask_cudf.read_parquet( str(tmpdir), - row_groups_per_part=row_groups, + split_row_groups=row_groups, ) dd.assert_eq(ddf1, ddf2, check_divisions=False) @@ -448,9 +471,9 @@ def test_create_metadata_file(tmpdir, partition_on): # with the _metadata file present ddf2 = dask_cudf.read_parquet( tmpdir, - gather_statistics=True, split_row_groups=False, index="myindex", + **_divisions(True), ) if partition_on: ddf1 = df1.sort_values("b") @@ 
-481,7 +504,7 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): # New pyarrow-dataset base can handle an inconsistent # schema (even without a _metadata file), but computing # and dtype validation may fail - ddf1 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True) + ddf1 = dask_cudf.read_parquet(str(tmpdir), **_divisions(True)) # Add global metadata file. # Dask-CuDF can do this without requiring schema @@ -490,7 +513,7 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): # Check that we can still read the ddf # with the _metadata file present - ddf2 = dask_cudf.read_parquet(str(tmpdir), gather_statistics=True) + ddf2 = dask_cudf.read_parquet(str(tmpdir), **_divisions(True)) # Check that the result is the same with and # without the _metadata file. Note that we must @@ -538,3 +561,12 @@ def test_cudf_list_struct_write(tmpdir): ddf.to_parquet(temp_file) new_ddf = dask_cudf.read_parquet(temp_file) dd.assert_eq(df, new_ddf) + + +def test_check_file_size(tmpdir): + # Test simple file-size check to help warn users + # of upstream change to `split_row_groups` default + fn = str(tmpdir.join("test.parquet")) + cudf.DataFrame({"a": np.arange(1000)}).to_parquet(fn) + with pytest.warns(match="large parquet file"): + dask_cudf.read_parquet(fn, check_file_size=1).compute() From a43fb9eafb15b50bf5de21ac0bdebd3b490f511e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 28 Apr 2022 11:04:12 -0400 Subject: [PATCH 06/28] Implement DataFrame.eval using libcudf ASTs (#8022) This PR exposes `libcudf`'s expression parsing functionality in `cudf` and uses it to implement `DataFrame.eval`. The implementation is mostly feature-complete, but there are a few limitations relative to the `pandas` API and a couple of gotchas around type casting. The implementation is reasonably performant, improving upon an equivalent `df.apply` even accounting for JIT-compilation overhead. This implementation provides a stepping stone to leveraging `libcudf`'s AST implementation for more complex tasks in `cudf` such as conditional joins. The most significant issue with the current implementation is the lack of casting between integral types, meaning that operations can only be performed between columns of the _exact_ same dtype. For example, operations between int8 and int16 would fail. This becomes particularly problematic for constants e.g. `df.eval('x+1')`. The best paths to improve this are at the C++ level of the expression evaluation, so I think we'll have to live with this limitation for now if we want to move forward. 
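As a minimal, hedged usage sketch (column names and values are illustrative, and both operands share a single dtype to stay within the casting limitation described above):

```python
import cudf

df = cudf.DataFrame({"x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]})

# Both columns are float64, so no cross-dtype casting is required:
result = df.eval("x + y")
print(result)
```

Note that an expression mixing integral dtypes (for example, an int8 column with an int16 column, or an int column with the literal `1` as in `df.eval('x+1')`) falls under the limitation above and is expected to fail until casting support lands at the C++ level.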
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Ashwin Srinath (https://github.com/shwina) - Bradley Dice (https://github.com/bdice) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/8022 --- cpp/include/cudf/ast/expressions.hpp | 6 +- python/cudf/cudf/_lib/__init__.py | 3 +- python/cudf/cudf/_lib/cpp/expressions.pxd | 88 +++++++ python/cudf/cudf/_lib/cpp/transform.pxd | 8 +- python/cudf/cudf/_lib/expressions.pxd | 38 +++ python/cudf/cudf/_lib/expressions.pyx | 130 ++++++++++ python/cudf/cudf/_lib/transform.pyx | 37 +++ .../cudf/cudf/core/_internals/expressions.py | 222 ++++++++++++++++++ python/cudf/cudf/core/dataframe.py | 160 +++++++++++++ python/cudf/cudf/tests/test_dataframe.py | 87 +++++++ 10 files changed, 775 insertions(+), 4 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/expressions.pxd create mode 100644 python/cudf/cudf/_lib/expressions.pxd create mode 100644 python/cudf/cudf/_lib/expressions.pyx create mode 100644 python/cudf/cudf/core/_internals/expressions.py diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp index eb98e0e0bee..96c99e054a5 100644 --- a/cpp/include/cudf/ast/expressions.hpp +++ b/cpp/include/cudf/ast/expressions.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,8 @@ #include #include +#include + namespace cudf { namespace ast { @@ -53,7 +55,7 @@ struct expression { /** * @brief Enum of supported operators. */ -enum class ast_operator { +enum class ast_operator : int32_t { // Binary operators ADD, ///< operator + SUB, ///< operator - diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index bd25aa53405..542262b7908 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import numpy as np from . import ( @@ -8,6 +8,7 @@ copying, csv, datetime, + expressions, filling, gpuarrow, groupby, diff --git a/python/cudf/cudf/_lib/cpp/expressions.pxd b/python/cudf/cudf/_lib/cpp/expressions.pxd new file mode 100644 index 00000000000..1721f8aa734 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/expressions.pxd @@ -0,0 +1,88 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.scalar.scalar cimport ( + duration_scalar, + numeric_scalar, + timestamp_scalar, +) +from cudf._lib.cpp.table.table_view cimport table_view +from cudf._lib.cpp.types cimport size_type + + +cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil: + ctypedef enum ast_operator: + # Binary operators + ADD "cudf::ast::ast_operator::ADD" + SUB "cudf::ast::ast_operator::SUB" + MUL "cudf::ast::ast_operator::MUL" + DIV "cudf::ast::ast_operator::DIV" + TRUE_DIV "cudf::ast::ast_operator::TRUE_DIV" + FLOOR_DIV "cudf::ast::ast_operator::FLOOR_DIV" + MOD "cudf::ast::ast_operator::MOD" + PYMOD "cudf::ast::ast_operator::PYMOD" + POW "cudf::ast::ast_operator::POW" + EQUAL "cudf::ast::ast_operator::EQUAL" + NULL_EQUAL "cudf::ast::ast_operator::NULL_EQUAL" + NOT_EQUAL "cudf::ast::ast_operator::NOT_EQUAL" + LESS "cudf::ast::ast_operator::LESS" + GREATER "cudf::ast::ast_operator::GREATER" + LESS_EQUAL "cudf::ast::ast_operator::LESS_EQUAL" + GREATER_EQUAL "cudf::ast::ast_operator::GREATER_EQUAL" + BITWISE_AND "cudf::ast::ast_operator::BITWISE_AND" + BITWISE_OR "cudf::ast::ast_operator::BITWISE_OR" + BITWISE_XOR "cudf::ast::ast_operator::BITWISE_XOR" + NULL_LOGICAL_AND "cudf::ast::ast_operator::NULL_LOGICAL_AND" + LOGICAL_AND "cudf::ast::ast_operator::LOGICAL_AND" + NULL_LOGICAL_OR "cudf::ast::ast_operator::NULL_LOGICAL_OR" + LOGICAL_OR "cudf::ast::ast_operator::LOGICAL_OR" + # Unary operators + IDENTITY "cudf::ast::ast_operator::IDENTITY" + SIN "cudf::ast::ast_operator::SIN" + COS "cudf::ast::ast_operator::COS" + TAN "cudf::ast::ast_operator::TAN" + ARCSIN "cudf::ast::ast_operator::ARCSIN" + ARCCOS "cudf::ast::ast_operator::ARCCOS" + ARCTAN "cudf::ast::ast_operator::ARCTAN" + SINH "cudf::ast::ast_operator::SINH" + COSH "cudf::ast::ast_operator::COSH" + TANH "cudf::ast::ast_operator::TANH" + ARCSINH "cudf::ast::ast_operator::ARCSINH" + ARCCOSH "cudf::ast::ast_operator::ARCCOSH" + ARCTANH "cudf::ast::ast_operator::ARCTANH" + EXP "cudf::ast::ast_operator::EXP" + LOG "cudf::ast::ast_operator::LOG" + SQRT "cudf::ast::ast_operator::SQRT" + CBRT "cudf::ast::ast_operator::CBRT" + CEIL "cudf::ast::ast_operator::CEIL" + FLOOR "cudf::ast::ast_operator::FLOOR" + ABS "cudf::ast::ast_operator::ABS" + RINT "cudf::ast::ast_operator::RINT" + BIT_INVERT "cudf::ast::ast_operator::BIT_INVERT" + NOT "cudf::ast::ast_operator::NOT" + + cdef cppclass expression: + pass + + ctypedef enum table_reference: + LEFT "cudf::ast::table_reference::LEFT" + RIGHT "cudf::ast::table_reference::RIGHT" + + cdef cppclass literal(expression): + # Due to https://github.com/cython/cython/issues/3198, we need to + # specify a return type for templated constructors. + literal literal[T](numeric_scalar[T] &) except + + literal literal[T](timestamp_scalar[T] &) except + + literal literal[T](duration_scalar[T] &) except + + + cdef cppclass column_reference(expression): + # Allow for default C++ parameters by declaring multiple constructors + # with the default parameters optionally omitted. 
+ column_reference(size_type) except + + column_reference(size_type, table_reference) except + + + cdef cppclass operation(expression): + operation(ast_operator, const expression &) + operation(ast_operator, const expression &, const expression&) diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/cpp/transform.pxd index 590a371ff52..d9de04b676e 100644 --- a/python/cudf/cudf/_lib/cpp/transform.pxd +++ b/python/cudf/cudf/_lib/cpp/transform.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -9,6 +9,7 @@ from rmm._lib.device_buffer cimport device_buffer from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.expressions cimport expression from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type @@ -42,3 +43,8 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: column_view input_column, column_view categories ) + + cdef unique_ptr[column] compute_column( + const table_view table, + const expression& expr + ) except + diff --git a/python/cudf/cudf/_lib/expressions.pxd b/python/cudf/cudf/_lib/expressions.pxd new file mode 100644 index 00000000000..85665822174 --- /dev/null +++ b/python/cudf/cudf/_lib/expressions.pxd @@ -0,0 +1,38 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from libc.stdint cimport int32_t, int64_t +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.expressions cimport ( + column_reference, + expression, + literal, + operation, +) +from cudf._lib.cpp.scalar.scalar cimport numeric_scalar + +ctypedef enum scalar_type_t: + INT + DOUBLE + + +ctypedef union int_or_double_scalar_ptr: + unique_ptr[numeric_scalar[int64_t]] int_ptr + unique_ptr[numeric_scalar[double]] double_ptr + + +cdef class Expression: + cdef unique_ptr[expression] c_obj + + +cdef class Literal(Expression): + cdef scalar_type_t c_scalar_type + cdef int_or_double_scalar_ptr c_scalar + + +cdef class ColumnReference(Expression): + pass + + +cdef class Operation(Expression): + pass diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx new file mode 100644 index 00000000000..f069bcdbe73 --- /dev/null +++ b/python/cudf/cudf/_lib/expressions.pyx @@ -0,0 +1,130 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +from enum import Enum + +from cython.operator cimport dereference +from libc.stdint cimport int64_t +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.utility cimport move + +from cudf._lib.cpp cimport expressions as libcudf_exp +from cudf._lib.cpp.types cimport size_type + +# Necessary for proper casting, see below. 
+ctypedef int32_t underlying_type_ast_operator + + +# Aliases for simplicity +ctypedef unique_ptr[libcudf_exp.expression] expression_ptr + + +class ASTOperator(Enum): + ADD = libcudf_exp.ast_operator.ADD + SUB = libcudf_exp.ast_operator.SUB + MUL = libcudf_exp.ast_operator.MUL + DIV = libcudf_exp.ast_operator.DIV + TRUE_DIV = libcudf_exp.ast_operator.TRUE_DIV + FLOOR_DIV = libcudf_exp.ast_operator.FLOOR_DIV + MOD = libcudf_exp.ast_operator.MOD + PYMOD = libcudf_exp.ast_operator.PYMOD + POW = libcudf_exp.ast_operator.POW + EQUAL = libcudf_exp.ast_operator.EQUAL + NULL_EQUAL = libcudf_exp.ast_operator.NULL_EQUAL + NOT_EQUAL = libcudf_exp.ast_operator.NOT_EQUAL + LESS = libcudf_exp.ast_operator.LESS + GREATER = libcudf_exp.ast_operator.GREATER + LESS_EQUAL = libcudf_exp.ast_operator.LESS_EQUAL + GREATER_EQUAL = libcudf_exp.ast_operator.GREATER_EQUAL + BITWISE_AND = libcudf_exp.ast_operator.BITWISE_AND + BITWISE_OR = libcudf_exp.ast_operator.BITWISE_OR + BITWISE_XOR = libcudf_exp.ast_operator.BITWISE_XOR + LOGICAL_AND = libcudf_exp.ast_operator.LOGICAL_AND + NULL_LOGICAL_AND = libcudf_exp.ast_operator.NULL_LOGICAL_AND + LOGICAL_OR = libcudf_exp.ast_operator.LOGICAL_OR + NULL_LOGICAL_OR = libcudf_exp.ast_operator.NULL_LOGICAL_OR + # Unary operators + IDENTITY = libcudf_exp.ast_operator.IDENTITY + SIN = libcudf_exp.ast_operator.SIN + COS = libcudf_exp.ast_operator.COS + TAN = libcudf_exp.ast_operator.TAN + ARCSIN = libcudf_exp.ast_operator.ARCSIN + ARCCOS = libcudf_exp.ast_operator.ARCCOS + ARCTAN = libcudf_exp.ast_operator.ARCTAN + SINH = libcudf_exp.ast_operator.SINH + COSH = libcudf_exp.ast_operator.COSH + TANH = libcudf_exp.ast_operator.TANH + ARCSINH = libcudf_exp.ast_operator.ARCSINH + ARCCOSH = libcudf_exp.ast_operator.ARCCOSH + ARCTANH = libcudf_exp.ast_operator.ARCTANH + EXP = libcudf_exp.ast_operator.EXP + LOG = libcudf_exp.ast_operator.LOG + SQRT = libcudf_exp.ast_operator.SQRT + CBRT = libcudf_exp.ast_operator.CBRT + CEIL = libcudf_exp.ast_operator.CEIL + FLOOR = libcudf_exp.ast_operator.FLOOR + ABS = libcudf_exp.ast_operator.ABS + RINT = libcudf_exp.ast_operator.RINT + BIT_INVERT = libcudf_exp.ast_operator.BIT_INVERT + NOT = libcudf_exp.ast_operator.NOT + + +class TableReference(Enum): + LEFT = libcudf_exp.table_reference.LEFT + RIGHT = libcudf_exp.table_reference.RIGHT + + +# Note that this function only currently supports numeric literals. libcudf +# expressions don't really support other types yet though, so this isn't +# restrictive at the moment. +cdef class Literal(Expression): + def __cinit__(self, value): + # TODO: Would love to find a better solution than unions for literals. 
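The `scalar_type_t` tag plus the pointer union declared in the .pxd above form a classic tagged union. A pure-Python stand-in for the same idea (illustrative only; the real class owns libcudf scalar objects):

    from dataclasses import dataclass
    from typing import Union

    @dataclass
    class PyLiteral:
        kind: str                 # stands in for scalar_type_t: "INT" or "DOUBLE"
        value: Union[int, float]  # stands in for the int/double union payload

    def make_literal(value):
        # Dispatch on the Python type, as Literal.__cinit__ does below.
        if isinstance(value, int):
            return PyLiteral("INT", value)
        if isinstance(value, float):
            return PyLiteral("DOUBLE", value)
        raise TypeError(f"unsupported literal type: {type(value).__name__}")

    assert make_literal(3).kind == "INT"
    assert make_literal(3.0).kind == "DOUBLE"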
+ cdef int intval + cdef double doubleval + + if isinstance(value, int): + self.c_scalar_type = scalar_type_t.INT + intval = value + self.c_scalar.int_ptr = make_unique[numeric_scalar[int64_t]]( + intval, True + ) + self.c_obj = make_unique[libcudf_exp.literal]( + dereference(self.c_scalar.int_ptr) + ) + elif isinstance(value, float): + self.c_scalar_type = scalar_type_t.DOUBLE + doubleval = value + self.c_scalar.double_ptr = make_unique[numeric_scalar[double]]( + doubleval, True + ) + self.c_obj = make_unique[libcudf_exp.literal]( + dereference(self.c_scalar.double_ptr) + ) + + +cdef class ColumnReference(Expression): + def __cinit__(self, size_type index): + self.c_obj = make_unique[libcudf_exp.column_reference]( + index + ) + + +cdef class Operation(Expression): + def __cinit__(self, op, Expression left, Expression right=None): + # This awkward double casting is the only way to get Cython to generate + # valid C++. Cython doesn't support scoped enumerations, so it assumes + # that enums correspond to their underlying value types and will thus + # attempt operations that are invalid without first explicitly casting + # to the underlying before casting to the desired type. + cdef libcudf_exp.ast_operator op_value = ( + op.value + ) + + if right is None: + self.c_obj = make_unique[libcudf_exp.operation]( + op_value, dereference(left.c_obj) + ) + else: + self.c_obj = make_unique[libcudf_exp.operation]( + op_value, dereference(left.c_obj), dereference(right.c_obj) + ) diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx index 175150b6865..2d94ef2cedf 100644 --- a/python/cudf/cudf/_lib/transform.pyx +++ b/python/cudf/cudf/_lib/transform.pyx @@ -5,9 +5,11 @@ from numba.np import numpy_support import cudf from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES +from cudf.core._internals.expressions import parse_expression from cudf.core.buffer import Buffer from cudf.utils import cudautils +from cython.operator cimport dereference from libc.stdint cimport uintptr_t from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair @@ -20,14 +22,18 @@ cimport cudf._lib.cpp.transform as libcudf_transform from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.expressions cimport expression from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type, type_id +from cudf._lib.expressions cimport Expression from cudf._lib.types cimport underlying_type_t_type_id from cudf._lib.utils cimport ( columns_from_unique_ptr, data_from_table_view, + data_from_unique_ptr, table_view_from_columns, + table_view_from_table, ) @@ -156,3 +162,34 @@ def one_hot_encode(Column input_column, Column categories): ) return encodings + + +def compute_column(list columns, tuple column_names, expr: str): + """Compute a new column by evaluating an expression on a set of columns. + + Parameters + ---------- + columns : list + The set of columns forming the table to evaluate the expression on. + column_names : tuple[str] + The names associated with each column. These names are necessary to map + column names in the expression to indices in the provided list of + columns, which are what will be used by libcudf to evaluate the + expression on the table. + expr : str + The expression to evaluate. 
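For orientation, this is roughly how the helper can be exercised from Python once the patch is applied; it goes through cudf internals, so treat it as a sketch rather than supported API:

    import cudf
    from cudf._lib.transform import compute_column

    df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 20, 30]})
    # _columns and _column_names are internal attributes, used here the
    # same way DataFrame.eval uses them later in this patch.
    col = compute_column([*df._columns], df._column_names, "a + b")
    print(cudf.Series._from_data({None: col}))  # 11, 22, 33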
+ """ + visitor = parse_expression(expr, column_names) + + # At the end, all the stack contains is the expression to evaluate. + cdef Expression cudf_expr = visitor.expression + cdef table_view tbl = table_view_from_columns(columns) + cdef unique_ptr[column] col + with nogil: + col = move( + libcudf_transform.compute_column( + tbl, + dereference(cudf_expr.c_obj.get()) + ) + ) + return Column.from_unique_ptr(move(col)) diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py new file mode 100644 index 00000000000..bc587d4e1e2 --- /dev/null +++ b/python/cudf/cudf/core/_internals/expressions.py @@ -0,0 +1,222 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +import ast +import functools +from typing import List, Tuple + +from cudf._lib.expressions import ( + ASTOperator, + ColumnReference, + Expression, + Literal, + Operation, +) + +# This dictionary encodes the mapping from Python AST operators to their cudf +# counterparts. +python_cudf_operator_map = { + # Binary operators + ast.Add: ASTOperator.ADD, + ast.Sub: ASTOperator.SUB, + ast.Mult: ASTOperator.MUL, + ast.Div: ASTOperator.DIV, + ast.FloorDiv: ASTOperator.FLOOR_DIV, + ast.Mod: ASTOperator.PYMOD, + ast.Pow: ASTOperator.POW, + ast.Eq: ASTOperator.EQUAL, + ast.NotEq: ASTOperator.NOT_EQUAL, + ast.Lt: ASTOperator.LESS, + ast.Gt: ASTOperator.GREATER, + ast.LtE: ASTOperator.LESS_EQUAL, + ast.GtE: ASTOperator.GREATER_EQUAL, + ast.BitXor: ASTOperator.BITWISE_XOR, + # TODO: The mapping of logical/bitwise operators here is inconsistent with + # pandas. In pandas, Both `BitAnd` and `And` map to + # `ASTOperator.LOGICAL_AND` for booleans, while they map to + # `ASTOperator.BITWISE_AND` for integers. However, there is no good way to + # encode this at present because expressions can be arbitrarily nested so + # we won't know the dtype of the input without inserting a much more + # complex traversal of the expression tree to determine the output types at + # each node. For now, we'll rely on users to use the appropriate operator. + ast.BitAnd: ASTOperator.BITWISE_AND, + ast.BitOr: ASTOperator.BITWISE_OR, + ast.And: ASTOperator.LOGICAL_AND, + ast.Or: ASTOperator.LOGICAL_OR, + # Unary operators + ast.Invert: ASTOperator.BIT_INVERT, + ast.Not: ASTOperator.NOT, + # TODO: Missing USub, possibility other unary ops? +} + + +# Mapping between Python function names encode in an ast.Call node and the +# corresponding libcudf C++ AST operators. +python_cudf_function_map = { + # TODO: Operators listed on + # https://pandas.pydata.org/pandas-docs/stable/user_guide/enhancingperf.html#expression-evaluation-via-eval # noqa: E501 + # that we don't support yet: + # expm1, log1p, arctan2 and log10. + "sin": ASTOperator.SIN, + "cos": ASTOperator.COS, + "tan": ASTOperator.TAN, + "arcsin": ASTOperator.ARCSIN, + "arccos": ASTOperator.ARCCOS, + "arctan": ASTOperator.ARCTAN, + "sinh": ASTOperator.SINH, + "cosh": ASTOperator.COSH, + "tanh": ASTOperator.TANH, + "arcsinh": ASTOperator.ARCSINH, + "arccosh": ASTOperator.ARCCOSH, + "arctanh": ASTOperator.ARCTANH, + "exp": ASTOperator.EXP, + "log": ASTOperator.LOG, + "sqrt": ASTOperator.SQRT, + "abs": ASTOperator.ABS, + "ceil": ASTOperator.CEIL, + "floor": ASTOperator.FLOOR, + # TODO: Operators supported by libcudf with no Python function analog. + # ast.rint: ASTOperator.RINT, + # ast.cbrt: ASTOperator.CBRT, +} + + +class libcudfASTVisitor(ast.NodeVisitor): + """A NodeVisitor specialized for constructing a libcudf expression tree. 
+ + This visitor is designed to handle AST nodes that have libcudf equivalents. + It constructs column references from names and literals from constants, + then builds up operations. The final result can be accessed using the + `expression` property. The visitor must be kept in scope for as long as the + expression is needed because all of the underlying libcudf expressions will + be destroyed when the libcudfASTVisitor is. + + Parameters + ---------- + col_names : Tuple[str] + The column names used to map the names in an expression. + """ + + def __init__(self, col_names: Tuple[str]): + self.stack: List[Expression] = [] + self.nodes: List[Expression] = [] + self.col_names = col_names + + @property + def expression(self): + """Expression: The result of parsing an AST.""" + assert len(self.stack) == 1 + return self.stack[-1] + + def visit_Name(self, node): + try: + col_id = self.col_names.index(node.id) + except ValueError: + raise ValueError(f"Unknown column name {node.id}") + self.stack.append(ColumnReference(col_id)) + + def visit_Constant(self, node): + if not isinstance(node, ast.Num): + raise ValueError( + f"Unsupported literal {repr(node.value)} of type " + "{type(node.value).__name__}" + ) + self.stack.append(Literal(node.value)) + + def visit_UnaryOp(self, node): + self.visit(node.operand) + self.nodes.append(self.stack.pop()) + if isinstance(node.op, ast.USub): + # TODO: Except for leaf nodes, we won't know the type of the + # operand, so there's no way to know whether this should be a float + # or an int. We should maybe see what Spark does, and this will + # probably require casting. + self.nodes.append(Literal(-1)) + op = ASTOperator.MUL + self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) + elif isinstance(node.op, ast.UAdd): + self.stack.append(self.nodes[-1]) + else: + op = python_cudf_operator_map[type(node.op)] + self.stack.append(Operation(op, self.nodes[-1])) + + def visit_BinOp(self, node): + self.visit(node.left) + self.visit(node.right) + self.nodes.append(self.stack.pop()) + self.nodes.append(self.stack.pop()) + + op = python_cudf_operator_map[type(node.op)] + self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2])) + + def _visit_BoolOp_Compare(self, operators, operands, has_multiple_ops): + # Helper function handling the common components of parsing BoolOp and + # Compare AST nodes. These two types of nodes both support chaining + # (e.g. `a > b > c` is equivalent to `a > b and b > c`, so this + # function helps standardize that. + + # TODO: Whether And/Or and BitAnd/BitOr actually correspond to + # logical or bitwise operators depends on the data types that they + # are applied to. We'll need to add logic to map to that. + inner_ops = [] + for op, (left, right) in zip(operators, operands): + # Note that this will lead to duplicate nodes, e.g. if + # the comparison is `a < b < c` that will be encoded as + # `a < b and b < c`. We could potentially optimize by caching + # expressions by name so that we only construct them once. + self.visit(left) + self.visit(right) + + self.nodes.append(self.stack.pop()) + self.nodes.append(self.stack.pop()) + + op = python_cudf_operator_map[type(op)] + inner_ops.append(Operation(op, self.nodes[-1], self.nodes[-2])) + + self.nodes.extend(inner_ops) + + # If we have more than one comparator, we need to link them + # together with LOGICAL_AND operators. 
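Concretely, a chained comparison decomposes into pairwise comparisons that are then AND-reduced; the plain-Python sketch below mirrors the `functools.reduce` in the code that follows (strings stand in for the Operation nodes):

    import functools

    # "a < b < c" decomposes into pairwise comparisons, then AND-reduces.
    inner_ops = ["a<b", "b<c"]
    combined = functools.reduce(lambda l, r: f"({l} AND {r})", inner_ops)
    assert combined == "(a<b AND b<c)"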
+ if has_multiple_ops: + op = ASTOperator.LOGICAL_AND + + def _combine_compare_ops(left, right): + self.nodes.append(Operation(op, left, right)) + return self.nodes[-1] + + functools.reduce(_combine_compare_ops, inner_ops) + + self.stack.append(self.nodes[-1]) + + def visit_BoolOp(self, node): + operators = [node.op] * (len(node.values) - 1) + operands = zip(node.values[:-1], node.values[1:]) + self._visit_BoolOp_Compare(operators, operands, len(node.values) > 2) + + def visit_Compare(self, node): + operands = (node.left, *node.comparators) + has_multiple_ops = len(operands) > 2 + operands = zip(operands[:-1], operands[1:]) + self._visit_BoolOp_Compare(node.ops, operands, has_multiple_ops) + + def visit_Call(self, node): + try: + op = python_cudf_function_map[node.func.id] + except KeyError: + raise ValueError(f"Unsupported function {node.func}.") + # Assuming only unary functions are supported, which is checked above. + if len(node.args) != 1 or node.keywords: + raise ValueError( + f"Function {node.func} only accepts one positional " + "argument." + ) + self.visit(node.args[0]) + + self.nodes.append(self.stack.pop()) + self.stack.append(Operation(op, self.nodes[-1])) + + +@functools.lru_cache(256) +def parse_expression(expr: str, col_names: Tuple[str]): + visitor = libcudfASTVisitor(col_names) + visitor.visit(ast.parse(expr)) + return visitor diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7b4b81630bd..0d3b3ee0300 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7,6 +7,7 @@ import itertools import numbers import pickle +import re import sys import warnings from collections import abc, defaultdict @@ -6253,6 +6254,165 @@ def interleave_columns(self): {None: libcudf.reshape.interleave_columns([*self._columns])} ) + @_cudf_nvtx_annotate + def eval(self, expr: str, inplace: bool = False, **kwargs): + """Evaluate a string describing operations on DataFrame columns. + + Operates on columns only, not specific rows or elements. + + Parameters + ---------- + expr : str + The expression string to evaluate. + inplace : bool, default False + If the expression contains an assignment, whether to perform the + operation inplace and mutate the existing DataFrame. Otherwise, + a new DataFrame is returned. + **kwargs + Not supported. + + Returns + ------- + DataFrame, Series, or None + Series if a single column is returned (the typical use case), + DataFrame if any assignment statements are included in + ``expr``, or None if ``inplace=True``. + + Notes + ----- + Difference from pandas: + * Additional kwargs are not supported. + * Bitwise and logical operators are not dtype-dependent. + Specifically, `&` must be used for bitwise operators on integers, + not `and`, which is specifically for the logical and between + booleans. + * Only numerical types are currently supported. + * Operators generally will not cast automatically. Users are + responsible for casting columns to suitable types before + evaluating a function. + * Multiple assignments to the same name (i.e. a sequence of + assignment statements where later statements are conditioned upon + the output of earlier statements) is not supported. + + Examples + -------- + >>> df = cudf.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + >>> df.eval('A + B') + 0 11 + 1 10 + 2 9 + 3 8 + 4 7 + dtype: int64 + + Assignment is allowed though by default the original DataFrame is not + modified. 
+ + >>> df.eval('C = A + B') + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + >>> df + A B + 0 1 10 + 1 2 8 + 2 3 6 + 3 4 4 + 4 5 2 + + Use ``inplace=True`` to modify the original DataFrame. + + >>> df.eval('C = A + B', inplace=True) + >>> df + A B C + 0 1 10 11 + 1 2 8 10 + 2 3 6 9 + 3 4 4 8 + 4 5 2 7 + + Multiple columns can be assigned to using multi-line expressions: + + >>> df.eval( + ... ''' + ... C = A + B + ... D = A - B + ... ''' + ... ) + A B C D + 0 1 10 11 -9 + 1 2 8 10 -6 + 2 3 6 9 -3 + 3 4 4 8 0 + 4 5 2 7 3 + """ + if kwargs: + raise ValueError( + "Keyword arguments other than `inplace` are not supported" + ) + + # Have to use a regex match to avoid capturing "==" + includes_assignment = re.search("[^=]=[^=]", expr) is not None + + # Check if there were multiple statements. Filter out empty lines. + statements = tuple(filter(None, expr.strip().split("\n"))) + if len(statements) > 1 and any( + re.search("[^=]=[^=]", st) is None for st in statements + ): + raise ValueError( + "Multi-line expressions are only valid if all expressions " + "contain an assignment." + ) + + if not includes_assignment: + if inplace: + raise ValueError( + "Cannot operate inplace if there is no assignment" + ) + return Series._from_data( + { + None: libcudf.transform.compute_column( + [*self._columns], self._column_names, statements[0] + ) + } + ) + + targets = [] + exprs = [] + for st in statements: + try: + t, e = re.split("[^=]=[^=]", st) + except ValueError as err: + if "too many values" in str(err): + raise ValueError( + f"Statement {st} contains too many assignments ('=')" + ) + raise + targets.append(t.strip()) + exprs.append(e.strip()) + + cols = ( + libcudf.transform.compute_column( + [*self._columns], self._column_names, e + ) + for e in exprs + ) + ret = self if inplace else self.copy(deep=False) + for name, col in zip(targets, cols): + ret._data[name] = col + if not inplace: + return ret + def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index bf5c4ae319b..d95fe278469 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9266,3 +9266,90 @@ def test_empty_numeric_only(data): expected = pdf.prod(numeric_only=True) actual = gdf.prod(numeric_only=True) assert_eq(expected, actual) + + +@pytest.fixture +def df_eval(): + N = 10 + int_max = 10 + rng = cupy.random.default_rng(0) + return cudf.DataFrame( + { + "a": rng.integers(N, size=int_max), + "b": rng.integers(N, size=int_max), + "c": rng.integers(N, size=int_max), + "d": rng.integers(N, size=int_max), + } + ) + + +# Note that for now expressions do not automatically handle casting, so inputs +# need to be casted appropriately +@pytest.mark.parametrize( + "expr, dtype", + [ + ("a", int), + ("+a", int), + ("a + b", int), + ("a == b", int), + ("a / b", float), + ("a * b", int), + ("a > b", int), + ("a > b > c", int), + ("a > b < c", int), + ("a & b", int), + ("a & b | c", int), + ("sin(a)", float), + ("exp(sin(abs(a)))", float), + ("sqrt(floor(a))", float), + ("ceil(arctanh(a))", float), + ("(a + b) - (c * d)", int), + ("~a", int), + ("(a > b) and (c > d)", int), + ("(a > b) or (c > d)", int), + ("not (a > b)", int), + ("a + 1", int), + ("a + 1.0", float), + ("-a + 1", int), + ("+a + 1", int), + ("e = a + 1", int), + ( + """ + e = log(cos(a)) + 1.0 + f = abs(c) - exp(d) + """, + float, + ), + ("a_b_are_equal = (a == b)", int), + ], +) 
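A note on the assignment detection used by `eval` above: the pattern `[^=]=[^=]` matches a bare `=` only when neither neighbor is `=`, which is how `==` is excluded:

    import re

    ASSIGN = re.compile("[^=]=[^=]")
    assert ASSIGN.search("c = a + b") is not None  # bare '=' -> assignment
    assert ASSIGN.search("a == b") is None         # '==' is skipped
    # Caveat: '<=', '>=' and '!=' also contain a '=' whose neighbors are
    # not '=', so this check would treat them as assignments as well.
    assert ASSIGN.search("a <= b") is not None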
+def test_dataframe_eval(df_eval, expr, dtype): + df_eval = df_eval.astype(dtype) + expect = df_eval.to_pandas().eval(expr) + got = df_eval.eval(expr) + # In the specific case where the evaluated expression is a unary function + # of a single column with no nesting, pandas will retain the name. This + # level of compatibility is out of scope for now. + assert_eq(expect, got, check_names=False) + + # Test inplace + if re.search("[^=]=[^=]", expr) is not None: + pdf_eval = df_eval.to_pandas() + pdf_eval.eval(expr, inplace=True) + df_eval.eval(expr, inplace=True) + assert_eq(pdf_eval, df_eval) + + +@pytest.mark.parametrize( + "expr", + [ + """ + e = a + b + a == b + """, + "a_b_are_equal = (a == b) = c", + ], +) +def test_dataframe_eval_errors(df_eval, expr): + with pytest.raises(ValueError): + df_eval.eval(expr) From 20569f6cd9e03f1d7536ac49e6e93ffc99941e98 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 28 Apr 2022 11:38:06 -0400 Subject: [PATCH 07/28] Add `detail::hash_join` (#10695) Closes https://github.com/rapidsai/cudf/issues/10587 This PR adds a `detail::hash_join` class which is templated on the hash function. It also cleans up `join` internal functions by moving code around to proper files. The implementation of `detail::hash_join` is mainly taken from `cudf::hash_join::hash_join_impl`. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/10695 --- cpp/include/cudf/detail/join.hpp | 185 +++++++++++ .../cudf/detail/utilities/hash_functions.cuh | 3 - cpp/include/cudf/hashing.hpp | 3 + cpp/include/cudf/join.hpp | 17 +- cpp/src/join/hash_join.cu | 267 +++++++++++----- cpp/src/join/hash_join.cuh | 289 ------------------ cpp/src/join/join.cu | 85 ++---- cpp/src/join/join_common_utils.cuh | 82 ++++- cpp/src/join/join_common_utils.hpp | 13 +- cpp/src/join/join_utils.cu | 2 +- cpp/src/join/mixed_join.cu | 9 +- cpp/src/join/mixed_join_kernel.cuh | 7 +- cpp/src/join/mixed_join_semi.cu | 11 +- cpp/src/join/mixed_join_size_kernel.cuh | 7 +- cpp/src/join/semi_join.cu | 4 +- 15 files changed, 516 insertions(+), 468 deletions(-) create mode 100644 cpp/include/cudf/detail/join.hpp delete mode 100644 cpp/src/join/hash_join.cuh diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp new file mode 100644 index 00000000000..12e4aaa03fd --- /dev/null +++ b/cpp/include/cudf/detail/join.hpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include + +// Forward declaration +template +class default_allocator; + +namespace cudf { +namespace detail { + +constexpr int DEFAULT_JOIN_CG_SIZE = 2; + +enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; + +/** + * @brief Hash join that builds hash table in creation and probes results in subsequent `*_join` + * member functions. + * + * User-defined hash function can be passed via the template parameter `Hasher` + * + * @tparam Hasher Unary callable type + */ +template +struct hash_join { + public: + using map_type = + cuco::static_multimap>, + cuco::double_hashing>; + + hash_join() = delete; + ~hash_join() = default; + hash_join(hash_join const&) = delete; + hash_join(hash_join&&) = delete; + hash_join& operator=(hash_join const&) = delete; + hash_join& operator=(hash_join&&) = delete; + + private: + bool const _is_empty; ///< true if `_hash_table` is empty + cudf::null_equality const _nulls_equal; ///< whether to consider nulls as equal + cudf::table_view _build; ///< input table to build the hash map + cudf::structs::detail::flattened_table + _flattened_build_table; ///< flattened data structures for `_build` + map_type _hash_table; ///< hash table built on `_build` + + public: + /** + * @brief Constructor that internally builds the hash table based on the given `build` table. + * + * @throw cudf::logic_error if the number of columns in `build` table is 0. + * @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE. + * + * @param build The build table, from which the hash table is built. + * @param compare_nulls Controls whether null join-key values should match or not. + * @param stream CUDA stream used for device memory operations and kernel launches. + */ + hash_join(cudf::table_view const& build, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream = rmm::cuda_stream_default); + + /** + * @copydoc cudf::hash_join::inner_join + */ + std::pair>, + std::unique_ptr>> + inner_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + + /** + * @copydoc cudf::hash_join::left_join + */ + std::pair>, + std::unique_ptr>> + left_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + + /** + * @copydoc cudf::hash_join::full_join + */ + std::pair>, + std::unique_ptr>> + full_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + + /** + * @copydoc cudf::hash_join::inner_join_size + */ + [[nodiscard]] std::size_t inner_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const; + + /** + * @copydoc cudf::hash_join::left_join_size + */ + [[nodiscard]] std::size_t left_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const; + + /** + * @copydoc cudf::hash_join::full_join_size + */ + std::size_t full_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + + private: + /** + * @brief Probes the `_hash_table` built from `_build` for tuples in `probe_table`, + * and returns the output indices of `build_table` and `probe_table` as a combined table, + * i.e. 
if full join is specified as the join type then left join is called. Behavior + * is undefined if the provided `output_size` is smaller than the actual output size. + * + * @throw cudf::logic_error if build table is empty and `JoinKind == INNER_JOIN`. + * + * @tparam JoinKind The type of join to be performed. + * + * @param probe_table Table of probe side columns to join. + * @param output_size Optional value which allows users to specify the exact output size. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @param mr Device memory resource used to allocate the returned vectors. + * + * @return Join output indices vector pair. + */ + template + std::pair>, + std::unique_ptr>> + probe_join_indices(cudf::table_view const& probe_table, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; + + /** + * @copydoc cudf::detail::hash_join::probe_join_indices + * + * @throw cudf::logic_error if probe table is empty. + * @throw cudf::logic_error if the size of probe table exceeds `MAX_JOIN_SIZE`. + * @throw cudf::logic_error if the number of columns in build table and probe table do not match. + * @throw cudf::logic_error if the column data types in build table and probe table do not match. + */ + template + std::pair>, + std::unique_ptr>> + compute_hash_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const; +}; +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh index 9c6f3e9cb13..2c5434b63d2 100644 --- a/cpp/include/cudf/detail/utilities/hash_functions.cuh +++ b/cpp/include/cudf/detail/utilities/hash_functions.cuh @@ -32,9 +32,6 @@ #include namespace cudf { - -using hash_value_type = uint32_t; - namespace detail { /** diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index e973c585410..bbff304e547 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -19,6 +19,9 @@ #include namespace cudf { + +using hash_value_type = uint32_t; + /** * @addtogroup column_hash * @{ diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index d56f8f0e904..f48f8a83e9a 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include #include @@ -29,6 +30,16 @@ #include namespace cudf { + +// forward declaration +namespace detail { +template +class MurmurHash3_32; + +template +class hash_join; +} // namespace detail + /** * @addtogroup column_join * @{ @@ -503,6 +514,9 @@ std::unique_ptr cross_join( */ class hash_join { public: + using impl_type = + typename cudf::detail::hash_join>; + hash_join() = delete; ~hash_join(); hash_join(hash_join const&) = delete; @@ -634,8 +648,7 @@ class hash_join { rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; private: - struct hash_join_impl; - const std::unique_ptr impl; + const std::unique_ptr _impl; }; /** diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 8d2888fd761..3e0e76de708 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -13,11 +13,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include +#include "join_common_utils.cuh" #include #include +#include +#include #include +#include #include #include @@ -38,13 +41,67 @@ namespace cudf { namespace detail { - -std::pair<std::unique_ptr<table>, std::unique_ptr<table>
> get_empty_joined_table( - table_view const& probe, table_view const& build) +namespace { +/** + * @brief Calculates the exact size of the join output produced when + * joining two tables together. + * + * @throw cudf::logic_error if JoinKind is not INNER_JOIN or LEFT_JOIN + * + * @tparam JoinKind The type of join to be performed + * + * @param build_table The right hand table + * @param probe_table The left hand table + * @param hash_table A hash table built on the build table that maps the index + * of every row to the hash value of that row. + * @param nulls_equal Flag to denote nulls are equal or not. + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return The exact size of the output of the join operation + */ +template <join_kind JoinKind> +std::size_t compute_join_output_size(table_device_view build_table, + table_device_view probe_table, + cudf::detail::multimap_type const& hash_table, + bool const has_nulls, + cudf::null_equality const nulls_equal, + rmm::cuda_stream_view stream) { - std::unique_ptr<table>
empty_probe = empty_like(probe); - std::unique_ptr<table>
empty_build = empty_like(build); - return std::pair(std::move(empty_probe), std::move(empty_build)); + const size_type build_table_num_rows{build_table.num_rows()}; + const size_type probe_table_num_rows{probe_table.num_rows()}; + + // If the build table is empty, we know exactly how large the output + // will be for the different types of joins and can return immediately + if (0 == build_table_num_rows) { + switch (JoinKind) { + // Inner join with an empty table will have no output + case join_kind::INNER_JOIN: return 0; + + // Left join with an empty table will have an output of NULL rows + // equal to the number of rows in the probe table + case join_kind::LEFT_JOIN: return probe_table_num_rows; + + default: CUDF_FAIL("Unsupported join type"); + } + } + + auto const probe_nulls = cudf::nullate::DYNAMIC{has_nulls}; + pair_equality equality{probe_table, build_table, probe_nulls, nulls_equal}; + + row_hash hash_probe{probe_nulls, probe_table}; + auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); + make_pair_function pair_func{hash_probe, empty_key_sentinel}; + + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); + + std::size_t size; + if constexpr (JoinKind == join_kind::LEFT_JOIN) { + size = hash_table.pair_count_outer(iter, iter + probe_table_num_rows, equality, stream.value()); + } else { + size = hash_table.pair_count(iter, iter + probe_table_num_rows, equality, stream.value()); + } + + return size; } /** @@ -69,7 +126,7 @@ std::pair>, std::unique_ptr>> probe_join_hash_table(cudf::table_device_view build_table, cudf::table_device_view probe_table, - multimap_type const& hash_table, + cudf::detail::multimap_type const& hash_table, bool has_nulls, null_equality compare_nulls, std::optional output_size, @@ -145,7 +202,7 @@ probe_join_hash_table(cudf::table_device_view build_table, */ std::size_t get_full_join_size(cudf::table_device_view build_table, cudf::table_device_view probe_table, - multimap_type const& hash_table, + cudf::detail::multimap_type const& hash_table, bool const has_nulls, null_equality const compare_nulls, rmm::cuda_stream_view stream, @@ -157,8 +214,6 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, // If output size is zero, return immediately if (join_size == 0) { return join_size; } - rmm::device_scalar write_index(0, stream); - auto left_indices = std::make_unique>(join_size, stream, mr); auto right_indices = std::make_unique>(join_size, stream, mr); @@ -221,25 +276,12 @@ std::size_t get_full_join_size(cudf::table_device_view build_table, } return join_size + left_join_complement_size; } +} // namespace -std::unique_ptr combine_table_pair(std::unique_ptr&& left, - std::unique_ptr&& right) -{ - auto joined_cols = left->release(); - auto right_cols = right->release(); - joined_cols.insert(joined_cols.end(), - std::make_move_iterator(right_cols.begin()), - std::make_move_iterator(right_cols.end())); - return std::make_unique(std::move(joined_cols)); -} - -} // namespace detail - -hash_join::hash_join_impl::~hash_join_impl() = default; - -hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& build, - null_equality compare_nulls, - rmm::cuda_stream_view stream) +template +hash_join::hash_join(cudf::table_view const& build, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream) : _is_empty{build.num_rows() == 0}, _nulls_equal{compare_nulls}, _hash_table{compute_hash_table_size(build.num_rows()), @@ -263,41 +305,45 @@ hash_join::hash_join_impl::hash_join_impl(cudf::table_view const& 
build, cudf::detail::build_join_hash_table(_build, _hash_table, _nulls_equal, stream); } +template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::inner_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +hash_join::inner_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); return compute_hash_join(probe, output_size, stream, mr); } +template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::left_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +hash_join::left_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); return compute_hash_join(probe, output_size, stream, mr); } +template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::full_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +hash_join::full_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); return compute_hash_join(probe, output_size, stream, mr); } -std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const +template +std::size_t hash_join::inner_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const { CUDF_FUNC_RANGE(); @@ -320,8 +366,9 @@ std::size_t hash_join::hash_join_impl::inner_join_size(cudf::table_view const& p stream); } -std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const +template +std::size_t hash_join::left_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const { CUDF_FUNC_RANGE(); @@ -344,9 +391,10 @@ std::size_t hash_join::hash_join_impl::left_join_size(cudf::table_view const& pr stream); } -std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +template +std::size_t hash_join::full_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_FUNC_RANGE(); @@ -370,13 +418,51 @@ std::size_t hash_join::hash_join_impl::full_join_size(cudf::table_view const& pr mr); } +template +template +std::pair>, + std::unique_ptr>> +hash_join::probe_join_indices(cudf::table_view const& probe_table, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + // Trivial left join case - exit early + if (_is_empty and JoinKind != cudf::detail::join_kind::INNER_JOIN) { + return get_trivial_left_join_indices(probe_table, stream, mr); + } + + CUDF_EXPECTS(!_is_empty, "Hash table of hash join is null."); + + auto build_table_ptr = cudf::table_device_view::create(_build, stream); + auto probe_table_ptr = cudf::table_device_view::create(probe_table, stream); + + auto join_indices = cudf::detail::probe_join_hash_table( + *build_table_ptr, + *probe_table_ptr, + _hash_table, + cudf::has_nulls(probe_table) | cudf::has_nulls(_build), + _nulls_equal, + output_size, + stream, + mr); + + if constexpr (JoinKind == 
cudf::detail::join_kind::FULL_JOIN) { + auto complement_indices = detail::get_left_join_indices_complement( + join_indices.second, probe_table.num_rows(), _build.num_rows(), stream, mr); + join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); + } + return join_indices; +} + +template template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +hash_join::compute_hash_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty"); CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE, @@ -403,41 +489,64 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const& probe, return probe_join_indices(flattened_probe_table, output_size, stream, mr); } +} // namespace detail + +hash_join::~hash_join() = default; + +hash_join::hash_join(cudf::table_view const& build, + null_equality compare_nulls, + rmm::cuda_stream_view stream) + : _impl{std::make_unique(build, compare_nulls, stream)} +{ +} -template std::pair>, std::unique_ptr>> -hash_join::hash_join_impl::probe_join_indices(cudf::table_view const& probe_table, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const +hash_join::inner_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const { - // Trivial left join case - exit early - if (_is_empty and JoinKind != cudf::detail::join_kind::INNER_JOIN) { - return get_trivial_left_join_indices(probe_table, stream, mr); - } + return _impl->inner_join(probe, output_size, stream, mr); +} - CUDF_EXPECTS(!_is_empty, "Hash table of hash join is null."); +std::pair>, + std::unique_ptr>> +hash_join::left_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + return _impl->left_join(probe, output_size, stream, mr); +} - auto build_table_ptr = cudf::table_device_view::create(_build, stream); - auto probe_table_ptr = cudf::table_device_view::create(probe_table, stream); +std::pair>, + std::unique_ptr>> +hash_join::full_join(cudf::table_view const& probe, + std::optional output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + return _impl->full_join(probe, output_size, stream, mr); +} - auto join_indices = cudf::detail::probe_join_hash_table( - *build_table_ptr, - *probe_table_ptr, - _hash_table, - cudf::has_nulls(probe_table) | cudf::has_nulls(_build), - _nulls_equal, - output_size, - stream, - mr); +std::size_t hash_join::inner_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const +{ + return _impl->inner_join_size(probe, stream); +} - if constexpr (JoinKind == cudf::detail::join_kind::FULL_JOIN) { - auto complement_indices = detail::get_left_join_indices_complement( - join_indices.second, probe_table.num_rows(), _build.num_rows(), stream, mr); - join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream); - } - return join_indices; +std::size_t hash_join::left_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream) const +{ + return _impl->left_join_size(probe, stream); +} + +std::size_t 
hash_join::full_join_size(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const +{ + return _impl->full_join_size(probe, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh deleted file mode 100644 index e55de043372..00000000000 --- a/cpp/src/join/hash_join.cuh +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include - -#include -#include - -namespace cudf { -namespace detail { - -/** - * @brief Remaps a hash value to a new value if it is equal to the specified sentinel value. - * - * @param hash The hash value to potentially remap - * @param sentinel The reserved value - */ -template -constexpr auto remap_sentinel_hash(H hash, S sentinel) -{ - // Arbitrarily choose hash - 1 - return (hash == sentinel) ? (hash - 1) : hash; -} - -/** - * @brief Device functor to create a pair of hash value and index for a given row. - */ -class make_pair_function { - public: - CUDF_HOST_DEVICE make_pair_function(row_hash const& hash, - hash_value_type const empty_key_sentinel) - : _hash{hash}, _empty_key_sentinel{empty_key_sentinel} - { - } - - __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept - { - // Compute the hash value of row `i` - auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel); - return cuco::make_pair(row_hash_value, i); - } - - private: - row_hash _hash; - hash_value_type const _empty_key_sentinel; -}; - -/** - * @brief Calculates the exact size of the join output produced when - * joining two tables together. - * - * @throw cudf::logic_error if JoinKind is not INNER_JOIN or LEFT_JOIN - * - * @tparam JoinKind The type of join to be performed - * @tparam multimap_type The type of the hash table - * - * @param build_table The right hand table - * @param probe_table The left hand table - * @param hash_table A hash table built on the build table that maps the index - * of every row to the hash value of that row. - * @param nulls_equal Flag to denote nulls are equal or not. 
- * @param stream CUDA stream used for device memory operations and kernel launches - * - * @return The exact size of the output of the join operation - */ -template -std::size_t compute_join_output_size(table_device_view build_table, - table_device_view probe_table, - multimap_type const& hash_table, - bool const has_nulls, - cudf::null_equality const nulls_equal, - rmm::cuda_stream_view stream) -{ - const size_type build_table_num_rows{build_table.num_rows()}; - const size_type probe_table_num_rows{probe_table.num_rows()}; - - // If the build table is empty, we know exactly how large the output - // will be for the different types of joins and can return immediately - if (0 == build_table_num_rows) { - switch (JoinKind) { - // Inner join with an empty table will have no output - case join_kind::INNER_JOIN: return 0; - - // Left join with an empty table will have an output of NULL rows - // equal to the number of rows in the probe table - case join_kind::LEFT_JOIN: return probe_table_num_rows; - - default: CUDF_FAIL("Unsupported join type"); - } - } - - auto const probe_nulls = cudf::nullate::DYNAMIC{has_nulls}; - pair_equality equality{probe_table, build_table, probe_nulls, nulls_equal}; - - row_hash hash_probe{probe_nulls, probe_table}; - auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); - make_pair_function pair_func{hash_probe, empty_key_sentinel}; - - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); - - std::size_t size; - if constexpr (JoinKind == join_kind::LEFT_JOIN) { - size = hash_table.pair_count_outer(iter, iter + probe_table_num_rows, equality, stream.value()); - } else { - size = hash_table.pair_count(iter, iter + probe_table_num_rows, equality, stream.value()); - } - - return size; -} - -std::pair, std::unique_ptr
> get_empty_joined_table( - table_view const& probe, table_view const& build); - -std::unique_ptr combine_table_pair(std::unique_ptr&& left, - std::unique_ptr&& right); - -/** - * @brief Builds the hash table based on the given `build_table`. - * - * @tparam MultimapType The type of the hash table - * - * @param build Table of columns used to build join hash. - * @param hash_table Build hash table. - * @param nulls_equal Flag to denote nulls are equal or not. - * @param stream CUDA stream used for device memory operations and kernel launches. - * - */ -template -void build_join_hash_table(cudf::table_view const& build, - MultimapType& hash_table, - null_equality const nulls_equal, - rmm::cuda_stream_view stream) -{ - auto build_table_ptr = cudf::table_device_view::create(build, stream); - - CUDF_EXPECTS(0 != build_table_ptr->num_columns(), "Selected build dataset is empty"); - CUDF_EXPECTS(0 != build_table_ptr->num_rows(), "Build side table has no rows"); - - row_hash hash_build{nullate::DYNAMIC{cudf::has_nulls(build)}, *build_table_ptr}; - auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); - make_pair_function pair_func{hash_build, empty_key_sentinel}; - - auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); - - size_type const build_table_num_rows{build_table_ptr->num_rows()}; - if (nulls_equal == cudf::null_equality::EQUAL or (not nullable(build))) { - hash_table.insert(iter, iter + build_table_num_rows, stream.value()); - } else { - thrust::counting_iterator stencil(0); - auto const row_bitmask = cudf::detail::bitmask_and(build, stream).first; - row_is_valid pred{static_cast(row_bitmask.data())}; - - // insert valid rows - hash_table.insert_if(iter, iter + build_table_num_rows, stencil, pred, stream.value()); - } -} -} // namespace detail - -struct hash_join::hash_join_impl { - public: - hash_join_impl() = delete; - ~hash_join_impl(); - hash_join_impl(hash_join_impl const&) = delete; - hash_join_impl(hash_join_impl&&) = delete; - hash_join_impl& operator=(hash_join_impl const&) = delete; - hash_join_impl& operator=(hash_join_impl&&) = delete; - - private: - bool const _is_empty; - cudf::null_equality const _nulls_equal; - cudf::table_view _build; - std::vector> _created_null_columns; - cudf::structs::detail::flattened_table _flattened_build_table; - cudf::detail::multimap_type _hash_table; - - public: - /** - * @brief Constructor that internally builds the hash table based on the given `build` table - * - * @throw cudf::logic_error if the number of columns in `build` table is 0. - * @throw cudf::logic_error if the number of rows in `build` table exceeds MAX_JOIN_SIZE. - * - * @param build The build table, from which the hash table is built. - * @param compare_nulls Controls whether null join-key values should match or not. - * @param stream CUDA stream used for device memory operations and kernel launches. 
- */ - hash_join_impl(cudf::table_view const& build, - null_equality compare_nulls, - rmm::cuda_stream_view stream = rmm::cuda_stream_default); - - std::pair>, - std::unique_ptr>> - inner_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - std::pair>, - std::unique_ptr>> - left_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - std::pair>, - std::unique_ptr>> - full_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - [[nodiscard]] std::size_t inner_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const; - - [[nodiscard]] std::size_t left_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const; - - std::size_t full_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - private: - template - std::pair>, - std::unique_ptr>> - compute_hash_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; - - /** - * @brief Probes the `_hash_table` built from `_build` for tuples in `probe_table`, - * and returns the output indices of `build_table` and `probe_table` as a combined table, - * i.e. if full join is specified as the join type then left join is called. Behavior - * is undefined if the provided `output_size` is smaller than the actual output size. - * - * @throw cudf::logic_error if hash table is null. - * - * @tparam JoinKind The type of join to be performed. - * - * @param probe_table Table of probe side columns to join. - * @param output_size Optional value which allows users to specify the exact output size. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource used to allocate the returned vectors. - * - * @return Join output indices vector pair. - */ - template - std::pair>, - std::unique_ptr>> - probe_join_indices(cudf::table_view const& probe_table, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const; -}; - -} // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 15aed83b641..5c529c88d9d 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -13,8 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "join/hash_join.cuh" -#include "join/join_common_utils.hpp" +#include "join_common_utils.hpp" #include #include @@ -26,6 +25,26 @@ namespace cudf { namespace detail { +namespace { +std::pair, std::unique_ptr
> get_empty_joined_table( + table_view const& probe, table_view const& build) +{ + std::unique_ptr<table> empty_probe = empty_like(probe); + std::unique_ptr<table> empty_build = empty_like(build); + return std::pair(std::move(empty_probe), std::move(empty_build)); +} + +std::unique_ptr<table> combine_table_pair(std::unique_ptr<table>&& left, + std::unique_ptr<table>&& right) +{ + auto joined_cols = left->release(); + auto right_cols = right->release(); + joined_cols.insert(joined_cols.end(), + std::make_move_iterator(right_cols.begin()), + std::make_move_iterator(right_cols.end())); + return std::make_unique<table>(std::move(joined_cols)); +} +} // namespace std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<rmm::device_uvector<size_type>>> @@ -222,69 +241,8 @@ std::unique_ptr<table>
full_join(table_view const& left_input, mr); return combine_table_pair(std::move(left_result), std::move(right_result)); } - } // namespace detail -hash_join::~hash_join() = default; - -hash_join::hash_join(cudf::table_view const& build, - null_equality compare_nulls, - rmm::cuda_stream_view stream) - : impl{std::make_unique(build, compare_nulls, stream)} -{ -} - -std::pair>, - std::unique_ptr>> -hash_join::inner_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->inner_join(probe, output_size, stream, mr); -} - -std::pair>, - std::unique_ptr>> -hash_join::left_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->left_join(probe, output_size, stream, mr); -} - -std::pair>, - std::unique_ptr>> -hash_join::full_join(cudf::table_view const& probe, - std::optional output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->full_join(probe, output_size, stream, mr); -} - -std::size_t hash_join::inner_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const -{ - return impl->inner_join_size(probe, stream); -} - -std::size_t hash_join::left_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream) const -{ - return impl->left_join_size(probe, stream); -} - -std::size_t hash_join::full_join_size(cudf::table_view const& probe, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const -{ - return impl->full_join_size(probe, stream, mr); -} - -// external APIs - std::pair>, std::unique_ptr>> inner_join(table_view const& left, @@ -353,5 +311,4 @@ std::unique_ptr
full_join(table_view const& left, return detail::full_join( left, right, left_on, right_on, compare_nulls, rmm::cuda_stream_default, mr); } - } // namespace cudf diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index b778f13b5e1..fdb63419c84 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,8 +15,10 @@ */ #pragma once -#include +#include "join_common_utils.hpp" +#include +#include #include #include @@ -26,6 +28,41 @@ namespace cudf { namespace detail { +/** + * @brief Remaps a hash value to a new value if it is equal to the specified sentinel value. + * + * @param hash The hash value to potentially remap + * @param sentinel The reserved value + */ +template +constexpr auto remap_sentinel_hash(H hash, S sentinel) +{ + // Arbitrarily choose hash - 1 + return (hash == sentinel) ? (hash - 1) : hash; +} + +/** + * @brief Device functor to create a pair of hash value and index for a given row. + */ +class make_pair_function { + public: + CUDF_HOST_DEVICE make_pair_function(row_hash const& hash, + hash_value_type const empty_key_sentinel) + : _hash{hash}, _empty_key_sentinel{empty_key_sentinel} + { + } + + __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept + { + // Compute the hash value of row `i` + auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel); + return cuco::make_pair(row_hash_value, i); + } + + private: + row_hash _hash; + hash_value_type const _empty_key_sentinel; +}; /** * @brief Device functor to determine if a row is valid. @@ -98,6 +135,47 @@ get_trivial_left_join_indices( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Builds the hash table based on the given `build_table`. + * + * @tparam MultimapType The type of the hash table + * + * @param build Table of columns used to build join hash. + * @param hash_table Build hash table. + * @param nulls_equal Flag to denote nulls are equal or not. + * @param stream CUDA stream used for device memory operations and kernel launches. 
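+ * @note When `nulls_equal` is EQUAL or the build table has no nulls, every row
+ *       is inserted; otherwise a row-validity bitmask is computed and only the
+ *       valid rows are inserted via `insert_if` (see the body below).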
+ * + */ +template +void build_join_hash_table(cudf::table_view const& build, + MultimapType& hash_table, + null_equality const nulls_equal, + rmm::cuda_stream_view stream) +{ + auto build_table_ptr = cudf::table_device_view::create(build, stream); + + CUDF_EXPECTS(0 != build_table_ptr->num_columns(), "Selected build dataset is empty"); + CUDF_EXPECTS(0 != build_table_ptr->num_rows(), "Build side table has no rows"); + + row_hash hash_build{nullate::DYNAMIC{cudf::has_nulls(build)}, *build_table_ptr}; + auto const empty_key_sentinel = hash_table.get_empty_key_sentinel(); + make_pair_function pair_func{hash_build, empty_key_sentinel}; + + auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func); + + size_type const build_table_num_rows{build_table_ptr->num_rows()}; + if (nulls_equal == cudf::null_equality::EQUAL or (not nullable(build))) { + hash_table.insert(iter, iter + build_table_num_rows, stream.value()); + } else { + thrust::counting_iterator stencil(0); + auto const row_bitmask = cudf::detail::bitmask_and(build, stream).first; + row_is_valid pred{static_cast(row_bitmask.data())}; + + // insert valid rows + hash_table.insert_if(iter, iter + build_table_num_rows, stencil, pred, stream.value()); + } +} + // Convenient alias for a pair of unique pointers to device uvectors. using VectorPair = std::pair>, std::unique_ptr>>; diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index 526c22d1d5c..060e8bff6f8 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -15,8 +15,10 @@ */ #pragma once +#include #include #include +#include #include #include @@ -34,7 +36,6 @@ namespace cudf { namespace detail { constexpr size_type MAX_JOIN_SIZE{std::numeric_limits::max()}; -constexpr int DEFAULT_JOIN_CG_SIZE = 2; constexpr int DEFAULT_JOIN_BLOCK_SIZE = 128; constexpr int DEFAULT_JOIN_CACHE_SIZE = 128; constexpr size_type JoinNoneValue = std::numeric_limits::min(); @@ -45,12 +46,7 @@ using hash_type = cuco::detail::MurmurHash3_32; using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; -using multimap_type = - cuco::static_multimap>; +using multimap_type = cudf::hash_join::impl_type::map_type; // Multimap type used for mixed joins. TODO: This is a temporary alias used // until the mixed joins are converted to using CGs properly. Right now it's @@ -68,9 +64,6 @@ using row_hash = cudf::row_hasher; using row_equality = cudf::row_equality_comparator; -enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN }; - bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type); - } // namespace detail } // namespace cudf diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu index 1eb2d4cf4a7..7fa6642b19f 100644 --- a/cpp/src/join/join_utils.cu +++ b/cpp/src/join/join_utils.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include "join_common_utils.cuh" #include diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index b540c013f47..27ee77e3edd 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -14,6 +14,10 @@ * limitations under the License. 
*/ +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include "mixed_join_kernels.cuh" + #include #include #include @@ -23,12 +27,9 @@ #include #include #include -#include -#include -#include -#include #include +#include #include #include diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index f7081cc4d63..38955ef4667 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -16,10 +16,9 @@ #pragma once -#include -#include -#include -#include +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include "mixed_join_common_utils.cuh" #include #include diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index 60cc74991ef..13a1f1a0ce2 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -14,8 +14,14 @@ * limitations under the License. */ +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include "mixed_join_kernels_semi.cuh" + #include #include +#include +#include #include #include #include @@ -23,12 +29,9 @@ #include #include #include -#include -#include -#include -#include #include +#include #include #include diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 9eedc1a8015..ce70f7f18ee 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -14,10 +14,9 @@ * limitations under the License. */ -#include -#include -#include -#include +#include "join_common_utils.cuh" +#include "join_common_utils.hpp" +#include "mixed_join_common_utils.cuh" #include #include diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 9e1aa27a4e7..687e553fefd 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -45,7 +45,7 @@ namespace { /** * @brief Device functor to create a pair of hash value and index for a given row. */ -struct make_pair_function { +struct make_pair_fn { __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept { // The value is irrelevant since we only ever use the hash map to check for @@ -101,7 +101,7 @@ std::unique_ptr> left_semi_anti_join( auto const right_nulls = cudf::nullate::DYNAMIC{cudf::has_nulls(right_flattened_keys)}; row_hash const hash_build{right_nulls, *right_rows_d}; row_equality equality_build{right_nulls, *right_rows_d, *right_rows_d, compare_nulls}; - make_pair_function pair_func_build{}; + make_pair_fn pair_func_build{}; auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build); From 9ac24773d186c22ffbacbe31d92dad60ed2cdb5f Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 28 Apr 2022 21:52:46 +0530 Subject: [PATCH 08/28] Implement all methods of groupby rank aggregation in libcudf, python (#9569) Addresses part of https://github.com/rapidsai/cudf/issues/3591 - [x] move RANK (min method), DENSE_RANK (dense method) into single RANK aggregation - [x] max method - [x] average method - [x] first method - [x] percentage - [x] order, null order RANK, DENSE_RANK was implemented for spark requirement. Pandas groupby has 3 more methods. `rank(column_view, rank_method)` already has all 5 methods implemented. Current implementation has 2 separate aggregations RANK and DENSE_RANK. This is merged to single RANK with parameters `rank_aggregation(rank_method method, null_policy null_handling, bool percentage)` Groupby.rank support for 3 more methods will be added. 
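As a rough sketch of the unified API (names and defaults are the ones declared
in `cpp/include/cudf/aggregation.hpp` in this PR; the surrounding groupby setup
is omitted):

    // One factory for all five methods, replacing the separate
    // RANK / DENSE_RANK / PERCENT_RANK factories:
    auto dense = cudf::make_rank_aggregation<cudf::groupby_scan_aggregation>(
      cudf::rank_method::DENSE);

    // The old PERCENT_RANK becomes MIN rank with ONE_NORMALIZED percentage,
    // i.e. (rank - 1) / (count - 1):
    auto percent = cudf::make_rank_aggregation<cudf::groupby_scan_aggregation>(
      cudf::rank_method::MIN,
      cudf::order::ASCENDING,
      cudf::null_policy::EXCLUDE,
      cudf::null_order::AFTER,
      cudf::rank_percentage::ONE_NORMALIZED);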
This PR is also pre-requisite for spearman correlation. Additionally - [x] Cython, Python plumbing - [x] benchmark for groupby rank (all methods) - [x] PERCENT_RANK aggregation is replaced with MIN_0_INDEXED rank_method in RANK aggregation Authors: - Karthikeyan (https://github.com/karthikeyann) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - MithunR (https://github.com/mythrocks) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/9569 --- cpp/benchmarks/CMakeLists.txt | 12 +- .../groupby/group_rank_benchmark.cu | 109 +++++++ cpp/include/cudf/aggregation.hpp | 186 ++++-------- .../cudf/detail/aggregation/aggregation.hpp | 92 +++--- cpp/include/cudf/detail/scan.hpp | 9 +- cpp/include/cudf/sorting.hpp | 16 +- cpp/src/aggregation/aggregation.cpp | 79 ++--- cpp/src/groupby/groupby.cu | 13 +- cpp/src/groupby/sort/functors.hpp | 3 +- cpp/src/groupby/sort/group_rank_scan.cu | 287 ++++++++++++++---- cpp/src/groupby/sort/group_scan.hpp | 99 ++++-- cpp/src/groupby/sort/scan.cpp | 113 ++++--- cpp/src/reductions/scan/rank_scan.cu | 9 +- cpp/src/reductions/scan/scan.cpp | 22 +- cpp/tests/groupby/rank_scan_tests.cpp | 176 ++++++----- cpp/tests/reductions/list_rank_test.cpp | 63 ++-- cpp/tests/reductions/rank_tests.cpp | 21 +- cpp/tests/reductions/scan_tests.cpp | 1 + java/src/main/native/src/AggregationJni.cpp | 11 +- python/cudf/cudf/_lib/aggregation.pyx | 41 ++- python/cudf/cudf/_lib/cpp/aggregation.pxd | 24 ++ python/cudf/cudf/_lib/cpp/sorting.pxd | 9 +- python/cudf/cudf/_lib/groupby.pyx | 2 +- python/cudf/cudf/_lib/sort.pxd | 3 - python/cudf/cudf/_lib/sort.pyx | 14 +- python/cudf/cudf/core/groupby/groupby.py | 24 ++ python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/tests/test_groupby.py | 44 +++ 28 files changed, 944 insertions(+), 540 deletions(-) create mode 100644 cpp/benchmarks/groupby/group_rank_benchmark.cu delete mode 100644 python/cudf/cudf/_lib/sort.pxd diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 26bb10da69f..e93b2bf4f25 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -194,10 +194,18 @@ ConfigureBench(FILL_BENCH filling/repeat.cpp) # ################################################################################################## # * groupby benchmark ----------------------------------------------------------------------------- ConfigureBench( - GROUPBY_BENCH groupby/group_sum.cu groupby/group_nth.cu groupby/group_shift.cu - groupby/group_struct.cu groupby/group_no_requests.cu groupby/group_scan.cu + GROUPBY_BENCH + groupby/group_sum.cu + groupby/group_nth.cu + groupby/group_shift.cu + groupby/group_struct.cu + groupby/group_no_requests.cu + groupby/group_scan.cu + groupby/group_rank_benchmark.cu ) +ConfigureNVBench(GROUPBY_NVBENCH groupby/group_rank_benchmark.cu) + # ################################################################################################## # * hashing benchmark ----------------------------------------------------------------------------- ConfigureBench(HASHING_BENCH hashing/hash.cpp hashing/partition.cpp) diff --git a/cpp/benchmarks/groupby/group_rank_benchmark.cu b/cpp/benchmarks/groupby/group_rank_benchmark.cu new file mode 100644 index 00000000000..1eeb15debe9 --- /dev/null +++ b/cpp/benchmarks/groupby/group_rank_benchmark.cu @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +#include +#include +#include +#include + +#include + +template +static void nvbench_groupby_rank(nvbench::state& state, + nvbench::type_list>) +{ + using namespace cudf; + using type = int64_t; + constexpr auto dtype = type_to_id(); + cudf::rmm_pool_raii pool_raii; + + bool const is_sorted = state.get_int64("is_sorted"); + cudf::size_type const column_size = state.get_int64("data_size"); + constexpr int num_groups = 100; + + data_profile profile; + profile.set_null_frequency(std::nullopt); + profile.set_cardinality(0); + profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, num_groups); + + auto source_table = create_random_table({dtype, dtype}, row_count{column_size}, profile); + + // values to be pre-sorted too for groupby rank + if (is_sorted) source_table = cudf::sort(*source_table); + + table_view keys{{source_table->view().column(0)}}; + column_view order_by{source_table->view().column(1)}; + + auto agg = cudf::make_rank_aggregation(method); + std::vector requests; + requests.emplace_back(groupby::scan_request()); + requests[0].values = order_by; + requests[0].aggregations.push_back(std::move(agg)); + + groupby::groupby gb_obj(keys, null_policy::EXCLUDE, is_sorted ? 
sorted::YES : sorted::NO); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + // groupby scan uses sort implementation + auto result = gb_obj.scan(requests); + }); +} + +enum class rank_method : int32_t {}; + +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + cudf::rank_method, + [](cudf::rank_method value) { + switch (value) { + case cudf::rank_method::FIRST: return "FIRST"; + case cudf::rank_method::AVERAGE: return "AVERAGE"; + case cudf::rank_method::MIN: return "MIN"; + case cudf::rank_method::MAX: return "MAX"; + case cudf::rank_method::DENSE: return "DENSE"; + default: return "unknown"; + } + }, + [](cudf::rank_method value) { + switch (value) { + case cudf::rank_method::FIRST: return "cudf::rank_method::FIRST"; + case cudf::rank_method::AVERAGE: return "cudf::rank_method::AVERAGE"; + case cudf::rank_method::MIN: return "cudf::rank_method::MIN"; + case cudf::rank_method::MAX: return "cudf::rank_method::MAX"; + case cudf::rank_method::DENSE: return "cudf::rank_method::DENSE"; + default: return "unknown"; + } + }) + +using methods = nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(nvbench_groupby_rank, NVBENCH_TYPE_AXES(methods)) + .set_type_axes_names({"rank_method"}) + .set_name("groupby_rank") + .add_int64_axis("data_size", + { + 1000000, // 1M + 10000000, // 10M + 100000000, // 100M + }) + + .add_int64_axis("is_sorted", {0, 1}); diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 539a7c04106..5c7513a6c99 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -43,6 +43,32 @@ namespace detail { class simple_aggregations_collector; class aggregation_finalizer; } // namespace detail + +/** + * @brief Tie-breaker method to use for ranking the column. + * + * @see cudf::make_rank_aggregation for more details. + * @ingroup column_sort + */ +enum class rank_method : int32_t { + FIRST, ///< stable sort order ranking (no ties) + AVERAGE, ///< mean of first in the group + MIN, ///< min of first in the group + MAX, ///< max of first in the group + DENSE ///< rank always increases by 1 between groups +}; + +/** + * @brief Whether returned rank should be percentage or not and + * mention the type of percentage normalization. + * + */ +enum class rank_percentage : int32_t { + NONE, ///< rank + ZERO_NORMALIZED, ///< rank / count + ONE_NORMALIZED ///< (rank - 1) / (count - 1) +}; + /** * @brief Abstract base class for specifying the desired aggregation in an * `aggregation_request`. @@ -77,9 +103,7 @@ class aggregation { NUNIQUE, ///< count number of unique elements NTH_ELEMENT, ///< get the nth element ROW_NUMBER, ///< get row-number of current index (relative to rolling window) - RANK, ///< get rank of current index - DENSE_RANK, ///< get dense rank of current index - PERCENT_RANK, ///< get percent (i.e. fractional) rank of current index + RANK, ///< get rank of current index COLLECT_LIST, ///< collect values into a list COLLECT_SET, ///< collect values into a list without duplicate entries LEAD, ///< window function, accesses row at specified offset following current row @@ -323,9 +347,11 @@ std::unique_ptr make_row_number_aggregation(); /** * @brief Factory to create a RANK aggregation * - * `RANK` returns a non-nullable column of size_type "ranks": the number of rows preceding or - * equal to the current row plus one. As a result, ranks are not unique and gaps will appear in - * the ranking sequence. 
+ * `RANK` returns a column of size_type or double "ranks" (see note 3 below for how the + * data type is determined) for a given rank method and column order. + * If nulls are excluded, the rank will be null for those rows, otherwise a non-nullable column is + * returned. Double precision column is returned only when percentage!=NONE and when rank method is + * average. * * This aggregation only works with "scan" algorithms. The input column into the group or * ungrouped scan is an orderby column that orders the rows that the aggregate function ranks. @@ -333,10 +359,12 @@ std::unique_ptr make_row_number_aggregation(); * column containing the ordering columns. * * Note: - * 1. This method requires that the rows are presorted by the group keys and order_by columns. - * 2. `RANK` aggregations will return a fully valid column regardless of null_handling policy - * specified in the scan. - * 3. `RANK` aggregations are not compatible with exclusive scans. + * 1. This method could work faster with the rows that are presorted by the group keys and order_by + * columns. Though groupby object does not require order_by column to be sorted, groupby rank + * scan aggregation does require the order_by column to be sorted if the keys are sorted. + * 2. `RANK` aggregations are not compatible with exclusive scans. + * 3. All rank methods except AVERAGE method and percentage!=NONE returns size_type column. + * For AVERAGE method and percentage!=NONE, the return type is double column. * * @code{.pseudo} * Example: Consider a motor-racing statistics dataset, containing the following columns: @@ -362,123 +390,37 @@ std::unique_ptr make_row_number_aggregation(); * A grouped rank aggregation scan with: * groupby column : venue * input orderby column: time - * Produces the following rank column: - * { 1, 2, 3, 3, 5, 1, 2, 2, 4, 5} - * (This corresponds to the following grouping and `driver` rows:) - * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } - * <----------silverstone----------->|<-------------monza--------------> - * @endcode - */ -template -std::unique_ptr make_rank_aggregation(); - -/** - * @brief Factory to create a DENSE_RANK aggregation - * - * `DENSE_RANK` returns a non-nullable column of size_type "dense ranks": the preceding unique - * value's rank plus one. As a result, ranks are not unique but there are no gaps in the ranking - * sequence (unlike RANK aggregations). - * - * This aggregation only works with "scan" algorithms. The input column into the group or - * ungrouped scan is an orderby column that orders the rows that the aggregate function ranks. - * If rows are ordered by more than one column, the orderby input column should be a struct - * column containing the ordering columns. - * - * Note: - * 1. This method requires that the rows are presorted by the group keys and order_by columns. - * 2. `DENSE_RANK` aggregations will return a fully valid column regardless of null_handling - * policy specified in the scan. - * 3. `DENSE_RANK` aggregations are not compatible with exclusive scans. - * - * @code{.pseudo} - * Example: Consider a motor-racing statistics dataset, containing the following columns: - * 1. venue: (STRING) Location of the race event - * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters) - * 3. 
time: (INT32) Time taken to complete the circuit - * - * For the following presorted data: + * Produces the following rank column for each methods: + * first: { 1, 2, 3, 4, 5, 1, 2, 3, 4, 5} + * average: { 1, 2, 3.5, 3.5, 5, 1, 2.5, 2.5, 4, 5} + * min: { 1, 2, 3, 3, 5, 1, 2, 2, 4, 5} + * max: { 1, 2, 4, 4, 5, 1, 3, 3, 4, 5} + * dense: { 1, 2, 3, 3, 4, 1, 2, 2, 3, 4} + * This corresponds to the following grouping and `driver` rows: + * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } + * <----------silverstone----------->|<-------------monza--------------> + * + * min rank for each percentage types: + * NONE: { 1, 2, 3, 3, 5, 1, 2, 2, 4, 5 } + * ZERO_NORMALIZED : { 0.16, 0.33, 0.50, 0.50, 0.83, 0.16, 0.33, 0.33, 0.66, 0.83 } + * ONE_NORMALIZED: { 0.00, 0.25, 0.50, 0.50, 1.00, 0.00, 0.25, 0.25, 0.75, 1.00 } + * where count corresponds to the number of rows in the group. @see cudf::rank_percentage * - * [ // venue, driver, time - * { "silverstone", "HAM" ("hamilton"), 15823}, - * { "silverstone", "LEC" ("leclerc"), 15827}, - * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place. - * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place. - * { "silverstone", "RIC" ("ricciardo"), 15905}, - * { "monza", "RIC" ("ricciardo"), 12154}, - * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place. - * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place. - * { "monza", "LEC" ("leclerc"), 12201}, - * { "monza", "PER" ("perez"), 12203} - * ] - * - * A grouped dense rank aggregation scan with: - * groupby column : venue - * input orderby column: time - * Produces the following dense rank column: - * { 1, 2, 3, 3, 4, 1, 2, 2, 3, 4} - * (This corresponds to the following grouping and `driver` rows:) - * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } - * <----------silverstone----------->|<-------------monza--------------> * @endcode - */ -template -std::unique_ptr make_dense_rank_aggregation(); - -/** - * @brief Factory to create a PERCENT_RANK aggregation * - * `PERCENT_RANK` returns a non-nullable column of double precision "fractional" ranks. - * For row index `i`, the percent rank of row `i` is defined as: - * percent_rank = (rank - 1) / (group_row_count - 1) - * where, - * 1. rank is the `RANK` of the row within the group - * 2. group_row_count is the number of rows in the group - * - * This aggregation only works with "scan" algorithms. The input to the grouped or - * ungrouped scan is an orderby column that orders the rows that the aggregate function ranks. - * If rows are ordered by more than one column, the orderby input column should be a struct - * column containing the ordering columns. - * - * Note: - * 1. This method requires that the rows are presorted by the group keys and order_by columns. - * 2. `PERCENT_RANK` aggregations will return a fully valid column regardless of null_handling - * policy specified in the scan. - * 3. `PERCENT_RANK` aggregations are not compatible with exclusive scans. - * - * @code{.pseudo} - * Example: Consider a motor-racing statistics dataset, containing the following columns: - * 1. venue: (STRING) Location of the race event - * 2. driver: (STRING) Name of the car driver (abbreviated to 3 characters) - * 3. 
time: (INT32) Time taken to complete the circuit - * - * For the following presorted data: - * - * [ // venue, driver, time - * { "silverstone", "HAM" ("hamilton"), 15823}, - * { "silverstone", "LEC" ("leclerc"), 15827}, - * { "silverstone", "BOT" ("bottas"), 15834}, // <-- Tied for 3rd place. - * { "silverstone", "NOR" ("norris"), 15834}, // <-- Tied for 3rd place. - * { "silverstone", "RIC" ("ricciardo"), 15905}, - * { "monza", "RIC" ("ricciardo"), 12154}, - * { "monza", "NOR" ("norris"), 12156}, // <-- Tied for 2nd place. - * { "monza", "BOT" ("bottas"), 12156}, // <-- Tied for 2nd place. - * { "monza", "LEC" ("leclerc"), 12201}, - * { "monza", "PER" ("perez"), 12203} - * ] - * - * A grouped percent rank aggregation scan with: - * groupby column : venue - * input orderby column: time - * Produces the following percent rank column: - * { 0.00, 0.25, 0.50, 0.50, 1.00, 0.00, 0.25, 0.25, 0.75, 1.00 } - * - * (This corresponds to the following grouping and `driver` rows:) - * { "HAM", "LEC", "BOT", "NOR", "RIC", "RIC", "NOR", "BOT", "LEC", "PER" } - * <----------silverstone----------->|<-------------monza--------------> - * @endcode + * @param method The ranking method used for tie breaking (same values). + * @param column_order The desired sort order for ranking + * @param null_handling flag to include nulls during ranking. If nulls are not included, + * the corresponding rank will be null. + * @param null_precedence The desired order of null compared to other elements for column + * @param percentage enum to denote the type of conversion of ranks to percentage in range (0,1] */ template -std::unique_ptr make_percent_rank_aggregation(); +std::unique_ptr make_rank_aggregation(rank_method method, + order column_order = order::ASCENDING, + null_policy null_handling = null_policy::EXCLUDE, + null_order null_precedence = null_order::AFTER, + rank_percentage percentage = rank_percentage::NONE); /** * @brief Factory to create a COLLECT_LIST aggregation diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 886151fb9d6..8ca49dd7d5f 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -75,10 +75,6 @@ class simple_aggregations_collector { // Declares the interface for the simple class row_number_aggregation const& agg); virtual std::vector> visit(data_type col_type, class rank_aggregation const& agg); - virtual std::vector> visit(data_type col_type, - class dense_rank_aggregation const& agg); - virtual std::vector> visit( - data_type col_type, class percent_rank_aggregation const& agg); virtual std::vector> visit( data_type col_type, class collect_list_aggregation const& agg); virtual std::vector> visit(data_type col_type, @@ -127,8 +123,6 @@ class aggregation_finalizer { // Declares the interface for the finalizer virtual void visit(class nth_element_aggregation const& agg); virtual void visit(class row_number_aggregation const& agg); virtual void visit(class rank_aggregation const& agg); - virtual void visit(class dense_rank_aggregation const& agg); - virtual void visit(class percent_rank_aggregation const& agg); virtual void visit(class collect_list_aggregation const& agg); virtual void visit(class collect_set_aggregation const& agg); virtual void visit(class lead_lag_aggregation const& agg); @@ -642,32 +636,42 @@ class rank_aggregation final : public rolling_aggregation, public groupby_scan_aggregation, public scan_aggregation { public: - 
rank_aggregation() : aggregation{RANK} {} - - [[nodiscard]] std::unique_ptr clone() const override + rank_aggregation(rank_method method, + order column_order, + null_policy null_handling, + null_order null_precedence, + rank_percentage percentage) + : aggregation{RANK}, + _method{method}, + _column_order{column_order}, + _null_handling{null_handling}, + _null_precedence{null_precedence}, + _percentage(percentage) { - return std::make_unique(*this); } - std::vector> get_simple_aggregations( - data_type col_type, simple_aggregations_collector& collector) const override + rank_method const _method; ///< rank method + order const _column_order; ///< order of the column to rank + null_policy const _null_handling; ///< include or exclude nulls in ranks + null_order const _null_precedence; ///< order of nulls in ranks + rank_percentage const _percentage; ///< whether to return percentage ranks + + [[nodiscard]] bool is_equal(aggregation const& _other) const override { - return collector.visit(col_type, *this); + if (!this->aggregation::is_equal(_other)) { return false; } + auto const& other = dynamic_cast(_other); + return _method == other._method and _null_handling == other._null_handling and + _column_order == other._column_order and _null_precedence == other._null_precedence and + _percentage == other._percentage; } - void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } -}; -/** - * @brief Derived class for specifying a dense rank aggregation - */ -class dense_rank_aggregation final : public rolling_aggregation, - public groupby_scan_aggregation, - public scan_aggregation { - public: - dense_rank_aggregation() : aggregation{DENSE_RANK} {} + [[nodiscard]] size_t do_hash() const override + { + return this->aggregation::do_hash() ^ hash_impl(); + } [[nodiscard]] std::unique_ptr clone() const override { - return std::make_unique(*this); + return std::make_unique(*this); } std::vector> get_simple_aggregations( data_type col_type, simple_aggregations_collector& collector) const override @@ -675,24 +679,16 @@ class dense_rank_aggregation final : public rolling_aggregation, return collector.visit(col_type, *this); } void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } -}; - -class percent_rank_aggregation final : public rolling_aggregation, - public groupby_scan_aggregation, - public scan_aggregation { - public: - percent_rank_aggregation() : aggregation{PERCENT_RANK} {} - [[nodiscard]] std::unique_ptr clone() const override - { - return std::make_unique(*this); - } - std::vector> get_simple_aggregations( - data_type col_type, simple_aggregations_collector& collector) const override + private: + [[nodiscard]] size_t hash_impl() const { - return collector.visit(col_type, *this); + return std::hash{}(static_cast(_method)) ^ + std::hash{}(static_cast(_column_order)) ^ + std::hash{}(static_cast(_null_handling)) ^ + std::hash{}(static_cast(_null_precedence)) ^ + std::hash{}(static_cast(_percentage)); } - void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); } }; /** @@ -1278,19 +1274,7 @@ struct target_type_impl { // Always use size_type accumulator for RANK template struct target_type_impl { - using type = size_type; -}; - -// Always use size_type accumulator for DENSE_RANK -template -struct target_type_impl { - using type = size_type; -}; - -// Always use double for PERCENT_RANK -template -struct target_type_impl { - using type = double; + using type = size_type; // double for percentage=true. 
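+  // (For rank_method::AVERAGE or percentage != NONE the result column is
+  // FLOAT64; that case is resolved at runtime, e.g. by groupby's
+  // empty_column_constructor, rather than through this compile-time trait.)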
}; // Always use list for COLLECT_LIST @@ -1453,10 +1437,6 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind return f.template operator()(std::forward(args)...); case aggregation::RANK: return f.template operator()(std::forward(args)...); - case aggregation::DENSE_RANK: - return f.template operator()(std::forward(args)...); - case aggregation::PERCENT_RANK: - return f.template operator()(std::forward(args)...); case aggregation::COLLECT_LIST: return f.template operator()(std::forward(args)...); case aggregation::COLLECT_SET: diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp index fc829617c2d..13dddd3b0c8 100644 --- a/cpp/include/cudf/detail/scan.hpp +++ b/cpp/include/cudf/detail/scan.hpp @@ -103,16 +103,17 @@ std::unique_ptr inclusive_dense_rank_scan(column_view const& order_by, rmm::mr::device_memory_resource* mr); /** - * @brief Generate row percent ranks for a column. + * @brief Generate row ONE_NORMALIZED percent ranks for a column. + * Also, knowns as ANSI SQL PERCENT RANK. + * Calculated by (rank - 1) / (count - 1). * * @param order_by Input column to generate ranks for. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return rank values. */ -std::unique_ptr inclusive_percent_rank_scan(column_view const& order_by, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr inclusive_one_normalized_percent_rank_scan( + column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index ff334b9ee85..b7e915650dc 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -23,19 +24,6 @@ namespace cudf { -/** - * @brief Tie-breaker method to use for ranking the column. - * - * @ingroup column_sort - */ -enum class rank_method { - FIRST, ///< stable sort order ranking (no ties) - AVERAGE, ///< mean of first in the group - MIN, ///< min of first in the group - MAX, ///< max of first in the group - DENSE ///< rank always increases by 1 between groups -}; - /** * @addtogroup column_sort * @{ @@ -198,7 +186,7 @@ std::unique_ptr
stable_sort_by_key( * included, corresponding rank will be null. * @param null_precedence The desired order of null compared to other elements * for column - * @param percentage flag to convert ranks to percentage in range (0,1} + * @param percentage flag to convert ranks to percentage in range (0,1] * @param mr Device memory resource used to allocate the returned column's device memory * @return std::unique_ptr A column of containing the rank of the each * element of the column of `input`. The output column type will be `size_type` diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 8fedf641c8f..27732b25401 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -154,18 +154,6 @@ std::vector> simple_aggregations_collector::visit( return visit(col_type, static_cast(agg)); } -std::vector> simple_aggregations_collector::visit( - data_type col_type, dense_rank_aggregation const& agg) -{ - return visit(col_type, static_cast(agg)); -} - -std::vector> simple_aggregations_collector::visit( - data_type col_type, percent_rank_aggregation const& agg) -{ - return visit(col_type, static_cast(agg)); -} - std::vector> simple_aggregations_collector::visit( data_type col_type, collect_list_aggregation const& agg) { @@ -334,16 +322,6 @@ void aggregation_finalizer::visit(rank_aggregation const& agg) visit(static_cast(agg)); } -void aggregation_finalizer::visit(dense_rank_aggregation const& agg) -{ - visit(static_cast(agg)); -} - -void aggregation_finalizer::visit(percent_rank_aggregation const& agg) -{ - visit(static_cast(agg)); -} - void aggregation_finalizer::visit(collect_list_aggregation const& agg) { visit(static_cast(agg)); @@ -644,36 +622,33 @@ template std::unique_ptr make_row_number_aggregation -std::unique_ptr make_rank_aggregation() -{ - return std::make_unique(); -} -template std::unique_ptr make_rank_aggregation(); -template std::unique_ptr -make_rank_aggregation(); -template std::unique_ptr make_rank_aggregation(); - -/// Factory to create a DENSE_RANK aggregation -template -std::unique_ptr make_dense_rank_aggregation() -{ - return std::make_unique(); -} -template std::unique_ptr make_dense_rank_aggregation(); -template std::unique_ptr -make_dense_rank_aggregation(); -template std::unique_ptr make_dense_rank_aggregation(); - -/// Factory to create a PERCENT_RANK aggregation -template -std::unique_ptr make_percent_rank_aggregation() -{ - return std::make_unique(); -} -template std::unique_ptr make_percent_rank_aggregation(); -template std::unique_ptr -make_percent_rank_aggregation(); -template std::unique_ptr make_percent_rank_aggregation(); +std::unique_ptr make_rank_aggregation(rank_method method, + order column_order, + null_policy null_handling, + null_order null_precedence, + rank_percentage percentage) +{ + return std::make_unique( + method, column_order, null_handling, null_precedence, percentage); +} +template std::unique_ptr make_rank_aggregation( + rank_method method, + order column_order, + null_policy null_handling, + null_order null_precedence, + rank_percentage percentage); +template std::unique_ptr make_rank_aggregation( + rank_method method, + order column_order, + null_policy null_handling, + null_order null_precedence, + rank_percentage percentage); +template std::unique_ptr make_rank_aggregation( + rank_method method, + order column_order, + null_policy null_handling, + null_order null_precedence, + rank_percentage percentage); /// Factory to create a COLLECT_LIST aggregation template diff --git 
a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 79882239b38..a002b0bb744 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -101,9 +101,12 @@ namespace { * Adds special handling for COLLECT_LIST/COLLECT_SET, because: * 1. `make_empty_column()` does not support construction of nested columns. * 2. Empty lists need empty child columns, to persist type information. + * Adds special handling for RANK, because it needs to return double type column when rank_method is + * AVERAGE or percentage is true. */ struct empty_column_constructor { column_view values; + aggregation const& agg; template std::unique_ptr operator()() const @@ -116,6 +119,14 @@ struct empty_column_constructor { 0, make_empty_column(type_to_id()), empty_like(values), 0, {}); } + if constexpr (k == aggregation::Kind::RANK) { + auto const& rank_agg = dynamic_cast(agg); + if (rank_agg._method == cudf::rank_method::AVERAGE or + rank_agg._percentage != rank_percentage::NONE) + return make_empty_column(type_to_id()); + return make_empty_column(target_type(values.type(), k)); + } + // If `values` is LIST typed, and the aggregation results match the type, // construct empty results based on `values`. // Most generally, this applies if input type matches output type. @@ -148,7 +159,7 @@ auto empty_results(host_span requests) std::back_inserter(results), [&request](auto const& agg) { return cudf::detail::dispatch_type_and_aggregation( - request.values.type(), agg->kind, empty_column_constructor{request.values}); + request.values.type(), agg->kind, empty_column_constructor{request.values, *agg}); }); return aggregation_result{std::move(results)}; diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp index fa3d19bdcfd..748e34a583d 100644 --- a/cpp/src/groupby/sort/functors.hpp +++ b/cpp/src/groupby/sort/functors.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -87,7 +87,6 @@ struct store_result_functor { */ column_view get_sorted_values() { - if (is_presorted()) { return values; } return sorted_values ? sorted_values->view() : (sorted_values = helper.sorted_values(values, stream))->view(); }; diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 77d68edaa3a..0b25ab9a33d 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,7 @@ #include #include +#include #include #include #include @@ -35,23 +37,59 @@ namespace cudf { namespace groupby { namespace detail { namespace { + +/** + * @brief Functor to compare two rows of a table in given permutation order + * This is useful to identify unique elements in a sorted order table, when the permutation order is + * the sorted order of the table. + * + */ +template +struct permuted_comparator { + /** + * @brief comparator object which compares two rows of the table in given permutation order + * + * @param device_table Device table to compare + * @param permutation The permutation order, integer type column. 
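+ *        (`permutation[i]` is the pre-sort row index that sits at sorted
+ *        position `i`, so comparing two positions compares those rows)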
+ * @param has_nulls whether the table has nulls + */ + permuted_comparator(table_device_view device_table, Iterator const permutation, bool has_nulls) + : comparator(nullate::DYNAMIC{has_nulls}, device_table, device_table, null_equality::EQUAL), + permutation(permutation) + { + } + __device__ bool operator()(size_type index1, size_type index2) const + { + return comparator(permutation[index1], permutation[index2]); + }; + + private: + row_equality_comparator comparator; + Iterator const permutation; +}; + /** * @brief generate grouped row ranks or dense ranks using a row comparison then scan the results * + * @tparam forward true if the rank scan computation should use forward iterator traversal (default) + * else reverse iterator traversal * @tparam value_resolver flag value resolver function with boolean first and row number arguments * @tparam scan_operator scan function ran on the flag values - * @param order_by input column to generate ranks for + * @param grouped_values input column to generate ranks for + * @param value_order column of type INT32 that contains the order of the values in the + * grouped_values column * @param group_labels ID of group that the corresponding value belongs to * @param group_offsets group index offsets with group ID indices * @param resolver flag value resolver * @param scan_op scan operation ran on the flag results - * @param has_nulls true if nulls are included in the `order_by` column + * @param has_nulls true if nulls are included in the `grouped_values` column * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return std::unique_ptr rank values */ -template -std::unique_ptr rank_generator(column_view const& order_by, +template +std::unique_ptr rank_generator(column_view const& grouped_values, + column_view const& value_order, device_span group_labels, device_span group_offsets, value_resolver resolver, @@ -61,10 +99,11 @@ std::unique_ptr rank_generator(column_view const& order_by, rmm::mr::device_memory_resource* mr) { auto const flattened = cudf::structs::detail::flatten_nested_columns( - table_view{{order_by}}, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); + table_view{{grouped_values}}, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); auto const d_flat_order = table_device_view::create(flattened, stream); - row_equality_comparator comparator( - nullate::DYNAMIC{has_nulls}, *d_flat_order, *d_flat_order, null_equality::EQUAL); + auto sorted_index_order = value_order.begin(); + auto comparator = permuted_comparator(*d_flat_order, sorted_index_order, has_nulls); + auto ranks = make_fixed_width_column(data_type{type_to_id()}, flattened.flattened_columns().num_rows(), mask_state::UNALLOCATED, @@ -72,100 +111,218 @@ std::unique_ptr rank_generator(column_view const& order_by, mr); auto mutable_ranks = ranks->mutable_view(); - thrust::tabulate( - rmm::exec_policy(stream), - mutable_ranks.begin(), - mutable_ranks.end(), - [comparator, resolver, labels = group_labels.data(), offsets = group_offsets.data()] __device__( - size_type row_index) { - auto group_start = offsets[labels[row_index]]; + auto unique_identifier = [labels = group_labels.begin(), + offsets = group_offsets.begin(), + comparator, + resolver] __device__(size_type row_index) { + auto const group_start = offsets[labels[row_index]]; + if constexpr (forward) { + // First value of equal values is 1. 
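+      // For the MIN-rank resolver this yields row_index_in_group + 1 at the
+      // first element of each run of equal values and 0 inside the run; the
+      // inclusive scan below (DeviceMax for MIN rank) then carries each run's
+      // starting rank forward through the group.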
return resolver(row_index == group_start || !comparator(row_index, row_index - 1), row_index - group_start); - }); + } else { + auto const group_end = offsets[labels[row_index] + 1]; + // Last value of equal values is 1. + return resolver(row_index + 1 == group_end || !comparator(row_index, row_index + 1), + row_index - group_start); + } + }; + thrust::tabulate(rmm::exec_policy(stream), + mutable_ranks.begin(), + mutable_ranks.end(), + unique_identifier); + auto [group_labels_begin, mutable_rank_begin] = [&]() { + if constexpr (forward) { + return thrust::pair{group_labels.begin(), mutable_ranks.begin()}; + } else { + return thrust::pair{thrust::reverse_iterator(group_labels.end()), + thrust::reverse_iterator(mutable_ranks.end())}; + } + }(); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - group_labels.begin(), - group_labels.end(), - mutable_ranks.begin(), - mutable_ranks.begin(), + group_labels_begin, + group_labels_begin + group_labels.size(), + mutable_rank_begin, + mutable_rank_begin, thrust::equal_to{}, scan_op); - return ranks; } } // namespace -std::unique_ptr rank_scan(column_view const& order_by, - device_span group_labels, - device_span group_offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr min_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return rank_generator( - order_by, + return rank_generator( + grouped_values, + value_order, group_labels, group_offsets, [] __device__(bool unequal, auto row_index_in_group) { return unequal ? row_index_in_group + 1 : 0; }, DeviceMax{}, - has_nested_nulls(table_view{{order_by}}), + has_nested_nulls(table_view{{grouped_values}}), stream, mr); } -std::unique_ptr dense_rank_scan(column_view const& order_by, - device_span group_labels, - device_span group_offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr max_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { - return rank_generator( - order_by, + return rank_generator( + grouped_values, + value_order, group_labels, group_offsets, - [] __device__(bool const unequal, size_type const) { return unequal ? 1 : 0; }, - DeviceSum{}, - has_nested_nulls(table_view{{order_by}}), + [] __device__(bool unequal, auto row_index_in_group) { + return unequal ? 
row_index_in_group + 1 : std::numeric_limits::max(); + }, + DeviceMin{}, + has_nested_nulls(table_view{{grouped_values}}), stream, mr); } -std::unique_ptr percent_rank_scan(column_view const& order_by, +std::unique_ptr first_rank_scan(column_view const& grouped_values, + column_view const&, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto ranks = make_fixed_width_column( + data_type{type_to_id()}, group_labels.size(), mask_state::UNALLOCATED, stream, mr); + auto mutable_ranks = ranks->mutable_view(); + thrust::tabulate(rmm::exec_policy(stream), + mutable_ranks.begin(), + mutable_ranks.end(), + [labels = group_labels.begin(), + offsets = group_offsets.begin()] __device__(size_type row_index) { + auto group_start = offsets[labels[row_index]]; + return row_index - group_start + 1; + }); + return ranks; +} + +std::unique_ptr average_rank_scan(column_view const& grouped_values, + column_view const& value_order, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const rank_column = rank_scan( - order_by, group_labels, group_offsets, stream, rmm::mr::get_current_device_resource()); - auto const rank_view = rank_column->view(); - auto const group_size_iter = cudf::detail::make_counting_transform_iterator( - 0, - [labels = group_labels.begin(), - offsets = group_offsets.begin()] __device__(size_type row_index) { - auto const group_label = labels[row_index]; - auto const group_start = offsets[group_label]; - auto const group_end = offsets[group_label + 1]; - return group_end - group_start; - }); - - // Result type for PERCENT_RANK is independent of input type. - using result_type = cudf::detail::target_type_t; - - auto percent_rank_result = cudf::make_fixed_width_column( - data_type{type_to_id()}, rank_view.size(), mask_state::UNALLOCATED, stream, mr); - + auto max_rank = max_rank_scan(grouped_values, + value_order, + group_labels, + group_offsets, + stream, + rmm::mr::get_current_device_resource()); + auto min_rank = min_rank_scan(grouped_values, + value_order, + group_labels, + group_offsets, + stream, + rmm::mr::get_current_device_resource()); + auto ranks = make_fixed_width_column( + data_type{type_to_id()}, group_labels.size(), mask_state::UNALLOCATED, stream, mr); + auto mutable_ranks = ranks->mutable_view(); thrust::transform(rmm::exec_policy(stream), - rank_view.begin(), - rank_view.end(), - group_size_iter, - percent_rank_result->mutable_view().begin(), - [] __device__(auto const rank, auto const group_size) { - return group_size == 1 ? 0.0 : ((rank - 1.0) / (group_size - 1)); + max_rank->view().begin(), + max_rank->view().end(), + min_rank->view().begin(), + mutable_ranks.begin(), + [] __device__(auto max_rank, auto min_rank) -> double { + return min_rank + (max_rank - min_rank) / 2.0; }); + return ranks; +} - return percent_rank_result; +std::unique_ptr dense_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + return rank_generator( + grouped_values, + value_order, + group_labels, + group_offsets, + [] __device__(bool const unequal, size_type const) { return unequal ? 
1 : 0; }, + DeviceSum{}, + has_nested_nulls(table_view{{grouped_values}}), + stream, + mr); +} + +std::unique_ptr group_rank_to_percentage(rank_method const method, + rank_percentage const percentage, + column_view const& rank, + column_view const& count, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(percentage != rank_percentage::NONE, "Percentage cannot be NONE"); + auto ranks = make_fixed_width_column( + data_type{type_to_id()}, group_labels.size(), mask_state::UNALLOCATED, stream, mr); + ranks->set_null_mask(copy_bitmask(rank, stream, mr)); + auto mutable_ranks = ranks->mutable_view(); + + auto one_normalized = [] __device__(auto const rank, auto const group_size) { + return group_size == 1 ? 0.0 : ((rank - 1.0) / (group_size - 1)); + }; + if (method == rank_method::DENSE) { + thrust::tabulate(rmm::exec_policy(stream), + mutable_ranks.begin(), + mutable_ranks.end(), + [percentage, + one_normalized, + is_double = rank.type().id() == type_id::FLOAT64, + dcount = count.begin(), + labels = group_labels.begin(), + offsets = group_offsets.begin(), + d_rank = rank.begin(), + s_rank = rank.begin()] __device__(size_type row_index) -> double { + double const r = is_double ? d_rank[row_index] : s_rank[row_index]; + auto const count = dcount[labels[row_index]]; + size_type const last_rank_index = offsets[labels[row_index]] + count - 1; + auto const last_rank = s_rank[last_rank_index]; + return percentage == rank_percentage::ZERO_NORMALIZED + ? r / last_rank + : one_normalized(r, last_rank); + }); + } else { + thrust::tabulate(rmm::exec_policy(stream), + mutable_ranks.begin(), + mutable_ranks.end(), + [percentage, + one_normalized, + is_double = rank.type().id() == type_id::FLOAT64, + dcount = count.begin(), + labels = group_labels.begin(), + d_rank = rank.begin(), + s_rank = rank.begin()] __device__(size_type row_index) -> double { + double const r = is_double ? d_rank[row_index] : s_rank[row_index]; + auto const count = dcount[labels[row_index]]; + return percentage == rank_percentage::ZERO_NORMALIZED + ? 
r / count + : one_normalized(r, count); + }); + } + return ranks; } } // namespace detail diff --git a/cpp/src/groupby/sort/group_scan.hpp b/cpp/src/groupby/sort/group_scan.hpp index 76a7f3f73c7..dc0eb691748 100644 --- a/cpp/src/groupby/sort/group_scan.hpp +++ b/cpp/src/groupby/sort/group_scan.hpp @@ -85,52 +85,115 @@ std::unique_ptr count_scan(device_span group_labels, rmm::mr::device_memory_resource* mr); /** - * @brief Internal API to calculate groupwise rank value + * @brief Internal API to calculate groupwise min rank value * - * @param order_by column or struct column that rows within a group are sorted by + * @param grouped_values column or struct column that rows within a group are sorted by + * @param value_order column of type INT32 that contains the order of the values in the + * grouped_values column * @param group_labels ID of group that the corresponding value belongs to * @param group_offsets group index offsets with group ID indices * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Column of type size_type of rank values */ -std::unique_ptr rank_scan(column_view const& order_by, - device_span group_labels, - device_span group_offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr min_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to calculate groupwise max rank value + * + * @details @copydetails min_rank_scan(column_view const& grouped_values, + * column_view const& value_order, + * device_span group_labels, + * device_span group_offsets, + * rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) + */ +std::unique_ptr max_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to calculate groupwise first rank value + * + * @details @copydetails min_rank_scan(column_view const& grouped_values, + * column_view const& value_order, + * device_span group_labels, + * device_span group_offsets, + * rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) + */ +std::unique_ptr first_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +/** + * @brief Internal API to calculate groupwise average rank value + * + * @details @copydetails min_rank_scan(column_view const& grouped_values, + * column_view const& value_order, + * device_span group_labels, + * device_span group_offsets, + * rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) + */ +std::unique_ptr average_rank_scan(column_view const& grouped_values, + column_view const& value_order, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Internal API to calculate groupwise dense rank value * - * @param order_by column or struct column that rows within a group are sorted by + * @param grouped_values column or struct column that rows within a group are sorted by * @param group_labels ID of group that 
the corresponding value belongs to * @param group_offsets group index offsets with group ID indices * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Column of type size_type of dense rank values */ -std::unique_ptr dense_rank_scan(column_view const& order_by, +std::unique_ptr dense_rank_scan(column_view const& grouped_values, + column_view const& value_order, device_span group_labels, device_span group_offsets, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** - * @brief Internal API to calculate groupwise percent rank value + * @brief Convert groupwise rank to groupwise percentage rank * - * @param order_by column or struct column by which the rows within a group are sorted - * @param group_labels ID of group to which the row belongs + * @param method rank method + * @param percentage enum to denote the type of conversion ranks to percentage in range (0,1] + * @param rank Groupwise rank column + * @param count Groupwise count column + * @param group_labels ID of group that the corresponding value belongs to * @param group_offsets group index offsets with group ID indices * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return Column of type `double` of percent rank values + * @return Column of type double of rank values + */ -std::unique_ptr percent_rank_scan(column_view const& order_by, - device_span group_labels, - device_span group_offsets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr group_rank_to_percentage(rank_method const method, + rank_percentage const percentage, + column_view const& rank, + column_view const& count, + device_span group_labels, + device_span group_offsets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + } // namespace detail } // namespace groupby } // namespace cudf diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 20edc1b3f50..5d345273782 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -16,14 +16,20 @@ #include #include +#include #include #include #include #include #include +#include +#include +#include +#include #include #include +#include #include #include #include @@ -115,51 +121,70 @@ template <> void scan_result_functor::operator()(aggregation const& agg) { if (cache.has_result(values, agg)) return; - CUDF_EXPECTS(helper.is_presorted(), - "Rank aggregate in groupby scan requires the keys to be presorted"); - auto const order_by = get_grouped_values(); - CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(order_by), - "Unsupported list type in grouped rank scan."); - - cache.add_result( - values, - agg, - detail::rank_scan( - order_by, helper.group_labels(stream), helper.group_offsets(stream), stream, mr)); -} - -template <> -void scan_result_functor::operator()(aggregation const& agg) -{ - if (cache.has_result(values, agg)) return; - CUDF_EXPECTS(helper.is_presorted(), - "Dense rank aggregate in groupby scan requires the keys to be presorted"); - auto const order_by = get_grouped_values(); - CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(order_by), - "Unsupported list type in grouped dense_rank scan."); - cache.add_result( - values, - agg, - detail::dense_rank_scan( - order_by, helper.group_labels(stream), 
helper.group_offsets(stream), stream, mr)); -} - -template <> -void scan_result_functor::operator()(aggregation const& agg) -{ - if (cache.has_result(values, agg)) return; - CUDF_EXPECTS(helper.is_presorted(), - "Percent rank aggregate in groupby scan requires the keys to be presorted"); - auto const order_by = get_grouped_values(); - CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(order_by), - "Unsupported list type in grouped percent_rank scan."); - - cache.add_result( - values, - agg, - detail::percent_rank_scan( - order_by, helper.group_labels(stream), helper.group_offsets(stream), stream, mr)); + CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(values), + "Unsupported list type in grouped rank scan."); + auto const& rank_agg = dynamic_cast(agg); + auto const& group_labels = helper.group_labels(stream); + auto const group_labels_view = column_view(cudf::device_span(group_labels)); + auto const gather_map = [&]() { + if (is_presorted()) { // assumes both keys and values are sorted, Spark does this. + return cudf::detail::sequence( + group_labels.size(), *cudf::make_fixed_width_scalar(size_type{0}, stream), stream); + } else { + auto sort_order = (rank_agg._method == rank_method::FIRST ? cudf::detail::stable_sorted_order + : cudf::detail::sorted_order); + return sort_order(table_view({group_labels_view, get_grouped_values()}), + {order::ASCENDING, rank_agg._column_order}, + {null_order::AFTER, rank_agg._null_precedence}, + stream, + rmm::mr::get_current_device_resource()); + } + }(); + + auto rank_scan = [&]() { + switch (rank_agg._method) { + case rank_method::FIRST: return detail::first_rank_scan; + case rank_method::AVERAGE: return detail::average_rank_scan; + case rank_method::DENSE: return detail::dense_rank_scan; + case rank_method::MIN: return detail::min_rank_scan; + case rank_method::MAX: return detail::max_rank_scan; + default: CUDF_FAIL("Unsupported rank method in groupby scan"); + } + }(); + auto result = rank_scan(get_grouped_values(), + *gather_map, + helper.group_labels(stream), + helper.group_offsets(stream), + stream, + rmm::mr::get_current_device_resource()); + if (rank_agg._percentage != rank_percentage::NONE) { + auto count = get_grouped_values().nullable() and rank_agg._null_handling == null_policy::EXCLUDE + ? 
detail::group_count_valid(get_grouped_values(), + helper.group_labels(stream), + helper.num_groups(stream), + stream, + rmm::mr::get_current_device_resource()) + : detail::group_count_all(helper.group_offsets(stream), + helper.num_groups(stream), + stream, + rmm::mr::get_current_device_resource()); + result = detail::group_rank_to_percentage(rank_agg._method, + rank_agg._percentage, + *result, + *count, + helper.group_labels(stream), + helper.group_offsets(stream), + stream, + mr); + } + result = std::move(cudf::detail::scatter( + table_view{{*result}}, *gather_map, table_view{{*result}}, false, stream, mr) + ->release()[0]); + if (rank_agg._null_handling == null_policy::EXCLUDE) { + result->set_null_mask(cudf::detail::copy_bitmask(get_grouped_values(), stream, mr)); + } + cache.add_result(values, agg, std::move(result)); } } // namespace detail diff --git a/cpp/src/reductions/scan/rank_scan.cu b/cpp/src/reductions/scan/rank_scan.cu index 521f8e2d06f..0ababbf0a3d 100644 --- a/cpp/src/reductions/scan/rank_scan.cu +++ b/cpp/src/reductions/scan/rank_scan.cu @@ -102,16 +102,15 @@ std::unique_ptr inclusive_rank_scan(column_view const& order_by, mr); } -std::unique_ptr inclusive_percent_rank_scan(column_view const& order_by, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr inclusive_one_normalized_percent_rank_scan( + column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto const rank_column = inclusive_rank_scan(order_by, stream, rmm::mr::get_current_device_resource()); auto const rank_view = rank_column->view(); - // Result type for PERCENT_RANK is independent of input type. - using result_type = cudf::detail::target_type_t; + // Result type for min 0-index percent rank is independent of input type. 
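+  // (The rank itself is computed as size_type; converting it to a percentage
+  // always produces double, whatever the input column's type.)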
+ using result_type = double; auto percent_rank_result = cudf::make_fixed_width_column( data_type{type_to_id()}, rank_view.size(), mask_state::UNALLOCATED, stream, mr); diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp index 52aaad5ddcf..b678b9441a5 100644 --- a/cpp/src/reductions/scan/scan.cpp +++ b/cpp/src/reductions/scan/scan.cpp @@ -35,17 +35,17 @@ std::unique_ptr scan(column_view const& input, if (agg->kind == aggregation::RANK) { CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, "Rank aggregation operator requires an inclusive scan"); - return inclusive_rank_scan(input, rmm::cuda_stream_default, mr); - } - if (agg->kind == aggregation::DENSE_RANK) { - CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, - "Dense rank aggregation operator requires an inclusive scan"); - return inclusive_dense_rank_scan(input, rmm::cuda_stream_default, mr); - } - if (agg->kind == aggregation::PERCENT_RANK) { - CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE, - "Percent rank aggregation operator requires an inclusive scan"); - return inclusive_percent_rank_scan(input, rmm::cuda_stream_default, mr); + auto const& rank_agg = dynamic_cast(*agg); + if (rank_agg._method == rank_method::MIN) { + if (rank_agg._percentage == rank_percentage::NONE) { + return inclusive_rank_scan(input, rmm::cuda_stream_default, mr); + } else if (rank_agg._percentage == rank_percentage::ONE_NORMALIZED) { + return inclusive_one_normalized_percent_rank_scan(input, rmm::cuda_stream_default, mr); + } + } else if (rank_agg._method == rank_method::DENSE) { + return inclusive_dense_rank_scan(input, rmm::cuda_stream_default, mr); + } + CUDF_FAIL("Unsupported rank aggregation method for inclusive scan"); } return inclusive == scan_type::EXCLUSIVE diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp index 81369beb2ec..d4e8b4cbf0f 100644 --- a/cpp/tests/groupby/rank_scan_tests.cpp +++ b/cpp/tests/groupby/rank_scan_tests.cpp @@ -29,11 +29,9 @@ namespace test { using namespace iterators; template -using input = fixed_width_column_wrapper; -using rank_result_col = fixed_width_column_wrapper; -using percent_result_t = - cudf::detail::target_type_t; -using percent_result_col = fixed_width_column_wrapper; +using input = fixed_width_column_wrapper; +using rank_result_col = fixed_width_column_wrapper; +using percent_result_col = fixed_width_column_wrapper; using null_iter_t = decltype(nulls_at({})); auto constexpr X = int32_t{0}; // Placeholder for NULL rows. 
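For readers updating their own tests, the migration from the removed per-kind factories to the unified factory looks roughly like this (a sketch; the `groupby_scan_aggregation` template argument matches the Cython binding later in this series):

    // Old, one factory per rank kind (removed by this change):
    //   make_dense_rank_aggregation();    // dense rank
    //   make_rank_aggregation();          // MIN-method rank
    //   make_percent_rank_aggregation();  // one-normalized percent rank
    // New, single factory: method, ordering, null handling, and percentage
    // conversion are all selected through arguments.
    auto dense   = cudf::make_rank_aggregation<cudf::groupby_scan_aggregation>(
      cudf::rank_method::DENSE, {}, cudf::null_policy::INCLUDE);
    auto minimum = cudf::make_rank_aggregation<cudf::groupby_scan_aggregation>(
      cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE);
    auto percent = cudf::make_rank_aggregation<cudf::groupby_scan_aggregation>(
      cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE, {},
      cudf::rank_percentage::ONE_NORMALIZED);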
@@ -45,27 +43,31 @@ inline void test_rank_scans(column_view const& keys, column_view const& expected_rank, column_view const& expected_percent_rank) { - test_single_scan(keys, - order, - keys, - expected_dense, - make_dense_rank_aggregation(), - null_policy::INCLUDE, - sorted::YES); - test_single_scan(keys, - order, - keys, - expected_rank, - make_rank_aggregation(), - null_policy::INCLUDE, - sorted::YES); - test_single_scan(keys, - order, - keys, - expected_percent_rank, - make_percent_rank_aggregation(), - null_policy::INCLUDE, - sorted::YES); + test_single_scan( + keys, + order, + keys, + expected_dense, + make_rank_aggregation(rank_method::DENSE, {}, null_policy::INCLUDE), + null_policy::INCLUDE, + sorted::YES); + test_single_scan( + keys, + order, + keys, + expected_rank, + make_rank_aggregation(rank_method::MIN, {}, null_policy::INCLUDE), + null_policy::INCLUDE, + sorted::YES); + test_single_scan( + keys, + order, + keys, + expected_percent_rank, + make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED), + null_policy::INCLUDE, + sorted::YES); } struct groupby_rank_scan_test : public BaseFixture { @@ -148,7 +150,7 @@ TYPED_TEST(typed_groupby_rank_scan_test, basic) { using T = TypeParam; - auto const keys = input{0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; + auto const keys = /* */ input{0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; auto const make_order_by = [&] { return input{5, 5, 5, 4, 4, 4, 3, 3, 2, 2, 1, 1}; }; auto const order_by = make_order_by(); auto const order_by_struct = [&] { @@ -244,9 +246,12 @@ TYPED_TEST(typed_groupby_rank_scan_test, mixedStructs) std::vector requests; requests.emplace_back(groupby::scan_request()); requests[0].values = *struct_col; - requests[0].aggregations.push_back(make_dense_rank_aggregation()); - requests[0].aggregations.push_back(make_rank_aggregation()); - requests[0].aggregations.push_back(make_percent_rank_aggregation()); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::DENSE, {}, null_policy::INCLUDE)); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::MIN, {}, null_policy::INCLUDE)); + requests[0].aggregations.push_back(make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED)); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); auto [result_keys, agg_results] = gb_obj.scan(requests); @@ -288,13 +293,19 @@ TYPED_TEST(typed_groupby_rank_scan_test, nestedStructs) requests.emplace_back(groupby::scan_request()); requests.emplace_back(groupby::scan_request()); requests[0].values = *nested_structs; - requests[0].aggregations.push_back(make_dense_rank_aggregation()); - requests[0].aggregations.push_back(make_rank_aggregation()); - requests[0].aggregations.push_back(make_percent_rank_aggregation()); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::DENSE)); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::MIN)); + requests[0].aggregations.push_back(make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED)); requests[1].values = *flat_struct; - requests[1].aggregations.push_back(make_dense_rank_aggregation()); - requests[1].aggregations.push_back(make_rank_aggregation()); - requests[1].aggregations.push_back(make_percent_rank_aggregation()); + requests[1].aggregations.push_back( + make_rank_aggregation(rank_method::DENSE)); + requests[1].aggregations.push_back( + 
make_rank_aggregation(rank_method::MIN)); + requests[1].aggregations.push_back(make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED)); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); auto [result_keys, agg_results] = gb_obj.scan(requests); @@ -339,13 +350,19 @@ TYPED_TEST(typed_groupby_rank_scan_test, structsWithNullPushdown) requests.emplace_back(groupby::scan_request()); requests.emplace_back(groupby::scan_request()); requests[0].values = *possibly_null_structs; - requests[0].aggregations.push_back(make_dense_rank_aggregation()); - requests[0].aggregations.push_back(make_rank_aggregation()); - requests[0].aggregations.push_back(make_percent_rank_aggregation()); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::DENSE, {}, null_policy::INCLUDE)); + requests[0].aggregations.push_back( + make_rank_aggregation(rank_method::MIN, {}, null_policy::INCLUDE)); + requests[0].aggregations.push_back(make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED)); requests[1].values = *definitely_null_structs; - requests[1].aggregations.push_back(make_dense_rank_aggregation()); - requests[1].aggregations.push_back(make_rank_aggregation()); - requests[1].aggregations.push_back(make_percent_rank_aggregation()); + requests[1].aggregations.push_back( + make_rank_aggregation(rank_method::DENSE, {}, null_policy::INCLUDE)); + requests[1].aggregations.push_back( + make_rank_aggregation(rank_method::MIN, {}, null_policy::INCLUDE)); + requests[1].aggregations.push_back(make_rank_aggregation( + rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED)); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); auto [result_keys, agg_results] = gb_obj.scan(requests); @@ -405,11 +422,11 @@ TYPED_TEST(list_groupby_rank_scan_test, lists) requests.emplace_back(groupby::aggregation_request()); requests.emplace_back(groupby::aggregation_request()); requests[0].values = list_col; - requests[0].aggregations.push_back(make_dense_rank_aggregation()); - requests[0].aggregations.push_back(make_rank_aggregation()); + requests[0].aggregations.push_back(make_rank_aggregation(rank_method::DENSE)); + requests[0].aggregations.push_back(make_rank_aggregation(rank_method::MIN)); requests[1].values = struct_col; - requests[1].aggregations.push_back(make_dense_rank_aggregation()); - requests[1].aggregations.push_back(make_rank_aggregation()); + requests[1].aggregations.push_back(make_rank_aggregation(rank_method::DENSE)); + requests[1].aggregations.push_back(make_rank_aggregation(rank_method::MIN)); groupby::groupby gb_obj(table_view({keys}), null_policy::INCLUDE, sorted::YES); auto result = gb_obj.scan(requests); @@ -484,7 +501,7 @@ TEST(groupby_rank_scan_test, strings) keys, order_by_structs_with_nulls, expected_dense, expected_rank, expected_percent); } -TEST_F(groupby_rank_scan_test_failures, test_exception_triggers) +TEST_F(groupby_rank_scan_test_failures, DISABLED_test_exception_triggers) { using T = uint32_t; @@ -496,57 +513,60 @@ TEST_F(groupby_rank_scan_test_failures, test_exception_triggers) col, keys, col, - make_dense_rank_aggregation(), + make_rank_aggregation(rank_method::DENSE), null_policy::INCLUDE, sorted::NO), - "Dense rank aggregate in groupby scan requires the keys to be presorted"); + "Rank aggregate in groupby scan requires the keys to be presorted"); - CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, - col, - keys, - 
col, - make_rank_aggregation(), - null_policy::INCLUDE, - sorted::NO), - "Rank aggregate in groupby scan requires the keys to be presorted"); + CUDF_EXPECT_THROW_MESSAGE( + test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(rank_method::MIN), + null_policy::INCLUDE, + sorted::NO), + "Rank aggregate in groupby scan requires the keys to be presorted"); + + CUDF_EXPECT_THROW_MESSAGE( + test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(rank_method::DENSE), + null_policy::EXCLUDE, + sorted::YES), + "Rank aggregate in groupby scan requires the keys to be presorted"); CUDF_EXPECT_THROW_MESSAGE( test_single_scan(keys, col, keys, col, - make_dense_rank_aggregation(), + make_rank_aggregation(rank_method::MIN), null_policy::EXCLUDE, sorted::YES), - "Dense rank aggregate in groupby scan requires the keys to be presorted"); + "Rank aggregate in groupby scan requires the keys to be presorted"); - CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, - col, - keys, - col, - make_rank_aggregation(), - null_policy::EXCLUDE, - sorted::YES), - "Rank aggregate in groupby scan requires the keys to be presorted"); + CUDF_EXPECT_THROW_MESSAGE( + test_single_scan(keys, + col, + keys, + col, + make_rank_aggregation(rank_method::DENSE), + null_policy::EXCLUDE, + sorted::NO), + "Rank aggregate in groupby scan requires the keys to be presorted"); CUDF_EXPECT_THROW_MESSAGE( test_single_scan(keys, col, keys, col, - make_dense_rank_aggregation(), + make_rank_aggregation(rank_method::MIN), null_policy::EXCLUDE, sorted::NO), - "Dense rank aggregate in groupby scan requires the keys to be presorted"); - - CUDF_EXPECT_THROW_MESSAGE(test_single_scan(keys, - col, - keys, - col, - make_rank_aggregation(), - null_policy::EXCLUDE, - sorted::NO), - "Rank aggregate in groupby scan requires the keys to be presorted"); + "Rank aggregate in groupby scan requires the keys to be presorted"); } } // namespace test diff --git a/cpp/tests/reductions/list_rank_test.cpp b/cpp/tests/reductions/list_rank_test.cpp index d263677f23b..b3a8e7e0c28 100644 --- a/cpp/tests/reductions/list_rank_test.cpp +++ b/cpp/tests/reductions/list_rank_test.cpp @@ -42,10 +42,11 @@ TEST_F(ListRankScanTest, BasicList) auto const expected_dense_vals = cudf::test::fixed_width_column_wrapper{1, 1, 2, 3, 4, 5, 6, 7, 7, 8, 9, 9}; - this->test_ungrouped_rank_scan(col, - expected_dense_vals, - cudf::make_dense_rank_aggregation(), - cudf::null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + col, + expected_dense_vals, + cudf::make_rank_aggregation(cudf::rank_method::DENSE), + cudf::null_policy::INCLUDE); } TEST_F(ListRankScanTest, DeepList) @@ -73,20 +74,22 @@ TEST_F(ListRankScanTest, DeepList) { // Non-sliced auto const expected_dense_vals = cudf::test::fixed_width_column_wrapper{ 1, 1, 2, 3, 4, 5, 5, 5, 6, 6, 7, 7, 8, 9, 10, 11}; - this->test_ungrouped_rank_scan(col, - expected_dense_vals, - cudf::make_dense_rank_aggregation(), - cudf::null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + col, + expected_dense_vals, + cudf::make_rank_aggregation(cudf::rank_method::DENSE), + cudf::null_policy::INCLUDE); } { // sliced auto sliced_col = cudf::slice(col, {3, 12})[0]; auto const expected_dense_vals = cudf::test::fixed_width_column_wrapper{1, 2, 3, 3, 3, 4, 4, 5, 5}; - this->test_ungrouped_rank_scan(sliced_col, - expected_dense_vals, - cudf::make_dense_rank_aggregation(), - cudf::null_policy::INCLUDE); + this->test_ungrouped_rank_scan( + sliced_col, + expected_dense_vals, + cudf::make_rank_aggregation(cudf::rank_method::DENSE), + 
cudf::null_policy::INCLUDE);
  }
}

@@ -138,10 +141,11 @@ TEST_F(ListRankScanTest, ListOfStruct)
   auto expect = cudf::test::fixed_width_column_wrapper{
     1, 1, 2, 2, 3, 4, 4, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10};
-  this->test_ungrouped_rank_scan(list_column,
-                                 expect,
-                                 cudf::make_dense_rank_aggregation(),
-                                 cudf::null_policy::INCLUDE);
+  this->test_ungrouped_rank_scan(
+    list_column,
+    expect,
+    cudf::make_rank_aggregation(cudf::rank_method::DENSE),
+    cudf::null_policy::INCLUDE);
 }
 { // Sliced
   auto sliced_col = cudf::slice(list_column, {3, 15})[0];
   auto expect = cudf::test::fixed_width_column_wrapper{1, 2, 3, 3, 3, 4, 5, 6, 7, 7, 8, 8};
-  this->test_ungrouped_rank_scan(sliced_col,
-                                 expect,
-                                 cudf::make_dense_rank_aggregation(),
-                                 cudf::null_policy::INCLUDE);
+  this->test_ungrouped_rank_scan(
+    sliced_col,
+    expect,
+    cudf::make_rank_aggregation(cudf::rank_method::DENSE),
+    cudf::null_policy::INCLUDE);
 }
}

@@ -192,10 +197,11 @@ TEST_F(ListRankScanTest, ListOfEmptyStruct)
   auto expect = cudf::test::fixed_width_column_wrapper{1, 1, 2, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6};
-  this->test_ungrouped_rank_scan(*list_column,
-                                 expect,
-                                 cudf::make_dense_rank_aggregation(),
-                                 cudf::null_policy::INCLUDE);
+  this->test_ungrouped_rank_scan(
+    *list_column,
+    expect,
+    cudf::make_rank_aggregation(cudf::rank_method::DENSE),
+    cudf::null_policy::INCLUDE);
 }

 TEST_F(ListRankScanTest, EmptyDeepList)
 {
@@ -221,8 +227,9 @@
   auto expect = cudf::test::fixed_width_column_wrapper{1, 1, 2, 2};
-  this->test_ungrouped_rank_scan(*list_column,
-                                 expect,
-                                 cudf::make_dense_rank_aggregation(),
-                                 cudf::null_policy::INCLUDE);
+  this->test_ungrouped_rank_scan(
+    *list_column,
+    expect,
+    cudf::make_rank_aggregation(cudf::rank_method::DENSE),
+    cudf::null_policy::INCLUDE);
 }
diff --git a/cpp/tests/reductions/rank_tests.cpp b/cpp/tests/reductions/rank_tests.cpp
index fb2cd17fe30..3bf2899ce2f 100644
--- a/cpp/tests/reductions/rank_tests.cpp
+++ b/cpp/tests/reductions/rank_tests.cpp
@@ -36,15 +36,14 @@ namespace cudf::test {
 using namespace iterators;

 template
-using input = fixed_width_column_wrapper;
-using rank_result_col = fixed_width_column_wrapper;
-using percent_result_t =
-  cudf::detail::target_type_t;
-using percent_result_col = fixed_width_column_wrapper;
+using input = fixed_width_column_wrapper;
+using rank_result_col = fixed_width_column_wrapper;
+using percent_result_col = fixed_width_column_wrapper;

-auto const rank = cudf::make_rank_aggregation();
-auto const dense_rank = cudf::make_dense_rank_aggregation();
-auto const percent_rank = cudf::make_percent_rank_aggregation();
+auto const rank = cudf::make_rank_aggregation(cudf::rank_method::MIN);
+auto const dense_rank = cudf::make_rank_aggregation(cudf::rank_method::DENSE);
+auto const percent_rank = cudf::make_rank_aggregation(
+  cudf::rank_method::MIN, {}, null_policy::INCLUDE, {}, rank_percentage::ONE_NORMALIZED);

 auto constexpr INCLUSIVE_SCAN = cudf::scan_type::INCLUSIVE;
 auto constexpr INCLUDE_NULLS = cudf::null_policy::INCLUDE;
@@ -318,11 +319,11 @@ TEST(RankScanTest, ExclusiveScan)
   auto const vals = input{3, 4, 5};
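+  // A rank is defined by the values already seen in the column, so only an
+  // inclusive scan is meaningful; every variant below is expected to reject
+  // scan_type::EXCLUSIVE with the unified error message.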
CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, dense_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), - "Dense rank aggregation operator requires an inclusive scan"); + "Rank aggregation operator requires an inclusive scan"); CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), "Rank aggregation operator requires an inclusive scan"); CUDF_EXPECT_THROW_MESSAGE(cudf::scan(vals, percent_rank, scan_type::EXCLUSIVE, INCLUDE_NULLS), - "Percent rank aggregation operator requires an inclusive scan"); + "Rank aggregation operator requires an inclusive scan"); } } // namespace cudf::test diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index d533a91f4d0..68b4d85db2a 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -84,6 +84,7 @@ struct ScanTest : public BaseScanTest { case aggregation::PRODUCT: return std::is_invocable_v; case aggregation::MIN: return std::is_invocable_v; case aggregation::MAX: return std::is_invocable_v; + case aggregation::RANK: return std::is_invocable_v; // comparable default: return false; } return false; diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp index f8c448566c8..6ac73282615 100644 --- a/java/src/main/native/src/AggregationJni.cpp +++ b/java/src/main/native/src/AggregationJni.cpp @@ -82,11 +82,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNoParamAgg(JNIEnv case 27: // MERGE_M2 return cudf::make_merge_m2_aggregation(); case 28: // RANK - return cudf::make_rank_aggregation(); + return cudf::make_rank_aggregation(cudf::rank_method::MIN, {}, + cudf::null_policy::INCLUDE); case 29: // DENSE_RANK - return cudf::make_dense_rank_aggregation(); - case 30: // PERCENT_RANK - return cudf::make_percent_rank_aggregation(); + return cudf::make_rank_aggregation(cudf::rank_method::DENSE, {}, + cudf::null_policy::INCLUDE); + case 30: // ANSI SQL PERCENT_RANK + return cudf::make_rank_aggregation(cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE, + {}, cudf::rank_percentage::ONE_NORMALIZED); default: throw std::logic_error("Unsupported No Parameter Aggregation Operation"); } }(); diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 4dc91268d57..84dd9c3a576 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -30,7 +30,10 @@ from cudf._lib.types import Interpolation cimport cudf._lib.cpp.aggregation as libcudf_aggregation cimport cudf._lib.cpp.types as libcudf_types -from cudf._lib.cpp.aggregation cimport underlying_type_t_correlation_type +from cudf._lib.cpp.aggregation cimport ( + underlying_type_t_correlation_type, + underlying_type_t_rank_method, +) import cudf @@ -54,6 +57,7 @@ class AggregationKind(Enum): ARGMIN = libcudf_aggregation.aggregation.Kind.ARGMIN NUNIQUE = libcudf_aggregation.aggregation.Kind.NUNIQUE NTH = libcudf_aggregation.aggregation.Kind.NTH_ELEMENT + RANK = libcudf_aggregation.aggregation.Kind.RANK COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT UNIQUE = libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX @@ -77,6 +81,14 @@ class CorrelationType(IntEnum): ) +class RankMethod(IntEnum): + FIRST = libcudf_aggregation.rank_method.FIRST + AVERAGE = libcudf_aggregation.rank_method.AVERAGE + MIN = libcudf_aggregation.rank_method.MIN + MAX = libcudf_aggregation.rank_method.MAX + DENSE = libcudf_aggregation.rank_method.DENSE + + cdef class 
RollingAggregation: """A Cython wrapper for rolling window aggregations. @@ -564,6 +576,33 @@ cdef class GroupbyScanAggregation: cummin = min cummax = max + @classmethod + def rank(cls, method, ascending, na_option, pct): + cdef GroupbyScanAggregation agg = cls() + cdef libcudf_aggregation.rank_method c_method = ( + ( + ( + RankMethod[method.upper()] + ) + ) + ) + agg.c_obj = move( + libcudf_aggregation. + make_rank_aggregation[groupby_scan_aggregation]( + c_method, + (libcudf_types.order.ASCENDING if ascending else + libcudf_types.order.DESCENDING), + (libcudf_types.null_policy.EXCLUDE if na_option == "keep" else + libcudf_types.null_policy.INCLUDE), + (libcudf_types.null_order.BEFORE + if (na_option == "top") == ascending else + libcudf_types.null_order.AFTER), + (libcudf_aggregation.rank_percentage.ZERO_NORMALIZED + if pct else + libcudf_aggregation.rank_percentage.NONE) + )) + return agg + cdef class ReduceAggregation: """A Cython wrapper for reduce aggregations. diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 399deb74c9c..a1d1485e1e8 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -1,5 +1,6 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. from libc.stdint cimport int32_t +from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector @@ -7,11 +8,14 @@ from libcpp.vector cimport vector from cudf._lib.cpp.types cimport ( data_type, interpolation, + null_order, null_policy, + order, size_type, ) ctypedef int32_t underlying_type_t_correlation_type +ctypedef int32_t underlying_type_t_rank_method cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: @@ -35,6 +39,7 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: ARGMIN 'cudf::aggregation::ARGMIN' NUNIQUE 'cudf::aggregation::NUNIQUE' NTH_ELEMENT 'cudf::aggregation::NTH_ELEMENT' + RANK 'cudf::aggregation::RANK' COLLECT 'cudf::aggregation::COLLECT_LIST' COLLECT_SET 'cudf::aggregation::COLLECT_SET' PTX 'cudf::aggregation::PTX' @@ -68,6 +73,18 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: KENDALL 'cudf::correlation_type::KENDALL' SPEARMAN 'cudf::correlation_type::SPEARMAN' + ctypedef enum rank_method: + FIRST "cudf::rank_method::FIRST" + AVERAGE "cudf::rank_method::AVERAGE" + MIN "cudf::rank_method::MIN" + MAX "cudf::rank_method::MAX" + DENSE "cudf::rank_method::DENSE" + + ctypedef enum rank_percentage: + NONE "cudf::rank_percentage::NONE" + ZERO_NORMALIZED "cudf::rank_percentage::ZERO_NORMALIZED" + ONE_NORMALIZED "cudf::rank_percentage::ONE_NORMALIZED" + cdef unique_ptr[T] make_sum_aggregation[T]() except + cdef unique_ptr[T] make_product_aggregation[T]() except + @@ -127,3 +144,10 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: cdef unique_ptr[T] make_covariance_aggregation[T]( size_type min_periods, size_type ddof) except + + + cdef unique_ptr[T] make_rank_aggregation[T]( + rank_method method, + order column_order, + null_policy null_handling, + null_order null_precedence, + rank_percentage percentage) except + diff --git a/python/cudf/cudf/_lib/cpp/sorting.pxd b/python/cudf/cudf/_lib/cpp/sorting.pxd index 243b841ce4b..c6c42c327ac 100644 --- a/python/cudf/cudf/_lib/cpp/sorting.pxd +++ b/python/cudf/cudf/_lib/cpp/sorting.pxd @@ -7,20 +7,13 @@ from libcpp.vector cimport vector from cudf._lib.types import cudf_to_np_types, np_to_cudf_types cimport cudf._lib.cpp.types as libcudf_types +from 
cudf._lib.cpp.aggregation cimport rank_method from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: - ctypedef enum rank_method: - FIRST "cudf::rank_method::FIRST" - AVERAGE "cudf::rank_method::AVERAGE" - MIN "cudf::rank_method::MIN" - MAX "cudf::rank_method::MAX" - DENSE "cudf::rank_method::DENSE" - cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: cdef unique_ptr[column] sorted_order( table_view source_table, diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index 48f566b846d..be5bb2741b4 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -341,7 +341,7 @@ cdef class GroupBy: return columns_from_unique_ptr(move(c_result.second)) -_GROUPBY_SCANS = {"cumcount", "cumsum", "cummin", "cummax"} +_GROUPBY_SCANS = {"cumcount", "cumsum", "cummin", "cummax", "rank"} def _is_all_scan_aggregate(all_aggs): diff --git a/python/cudf/cudf/_lib/sort.pxd b/python/cudf/cudf/_lib/sort.pxd deleted file mode 100644 index d7488889555..00000000000 --- a/python/cudf/cudf/_lib/sort.pxd +++ /dev/null @@ -1,3 +0,0 @@ -from libc.stdint cimport int32_t - -ctypedef int32_t underlying_type_t_rank_method diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx index faa4279c1ca..1d7204a0a39 100644 --- a/python/cudf/cudf/_lib/sort.pyx +++ b/python/cudf/cudf/_lib/sort.pyx @@ -8,19 +8,21 @@ from libcpp.vector cimport vector from enum import IntEnum from cudf._lib.column cimport Column +from cudf._lib.cpp.aggregation cimport ( + rank_method, + underlying_type_t_rank_method, +) from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.search cimport lower_bound, upper_bound from cudf._lib.cpp.sorting cimport ( is_sorted as cpp_is_sorted, rank, - rank_method, sorted_order, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport null_order, null_policy, order -from cudf._lib.sort cimport underlying_type_t_rank_method from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns @@ -190,14 +192,6 @@ def digitize(list source_columns, list bins, bool right=False): return Column.from_unique_ptr(move(c_result)) -class RankMethod(IntEnum): - FIRST = < underlying_type_t_rank_method > rank_method.FIRST - AVERAGE = < underlying_type_t_rank_method > rank_method.AVERAGE - MIN = < underlying_type_t_rank_method > rank_method.MIN - MAX = < underlying_type_t_rank_method > rank_method.MAX - DENSE = < underlying_type_t_rank_method > rank_method.DENSE - - def rank_columns(list source_columns, object method, str na_option, bool ascending, bool pct ): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 1af84920057..013ae7ad033 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -207,6 +207,30 @@ def cumcount(self): .reset_index(drop=True) ) + def rank( + self, + method="average", + ascending=True, + na_option="keep", + pct=False, + axis=0, + ): + """ + Return the rank of values within each group. 
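+
+        Parameters
+        ----------
+        method : {"average", "min", "max", "first", "dense"}, default "average"
+            How to rank the group of records that have the same value (ties).
+        ascending : bool, default True
+            Rank from the lowest value (rank 1) upward when True.
+        na_option : {"keep", "top", "bottom"}, default "keep"
+            "keep" leaves the rank of null values as null, "top" gives them
+            the lowest ranks, and "bottom" the highest.
+        pct : bool, default False
+            Return ranks as a fraction of the group size instead of integers.
+        axis : int, default 0
+            Only ``axis=0`` is currently supported.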
+ """ + if not axis == 0: + raise NotImplementedError("Only axis=0 is supported.") + + def rank(x): + return getattr(x, "rank")( + method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) + + return self.agg(rank) + @cached_property def _groupby(self): return libgroupby.GroupBy( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a77fca098bc..1361fc56fa0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3632,7 +3632,7 @@ def rank( if method not in {"average", "min", "max", "first", "dense"}: raise KeyError(method) - method_enum = libcudf.sort.RankMethod[method.upper()] + method_enum = libcudf.aggregation.RankMethod[method.upper()] if na_option not in {"keep", "top", "bottom"}: raise ValueError( "na_option must be one of 'keep', 'top', or 'bottom'" diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 1411d7ba64c..9e87fdbd3be 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1811,6 +1811,50 @@ def test_groupby_2keys_scan(nelem, func): assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype) +@pytest.mark.parametrize("nelem", [100, 1000]) +@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"]) +@pytest.mark.parametrize("pct", [False, True]) +def test_groupby_2keys_rank(nelem, method, ascending, na_option, pct): + t = rand_dataframe( + dtypes_meta=[ + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0, "cardinality": 10}, + {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10}, + ], + rows=nelem, + use_threads=False, + ) + pdf = t.to_pandas() + pdf.columns = ["x", "y", "z"] + gdf = cudf.from_pandas(pdf) + expect_df = pdf.groupby(["x", "y"], sort=True).rank( + method=method, ascending=ascending, na_option=na_option, pct=pct + ) + got_df = gdf.groupby(["x", "y"], sort=True).rank( + method=method, ascending=ascending, na_option=na_option, pct=pct + ) + + assert_groupby_results_equal(got_df, expect_df, check_dtype=False) + + +def test_groupby_rank_fails(): + gdf = cudf.DataFrame( + {"x": [1, 2, 3, 4], "y": [1, 2, 3, 4], "z": [1, 2, 3, 4]} + ) + with pytest.raises(NotImplementedError): + gdf.groupby(["x", "y"]).rank(method="min", axis=1) + gdf = cudf.DataFrame( + { + "a": [1, 1, 1, 2, 2, 2], + "b": [[1, 2], [3, None, 5], None, [], [7, 8], [9]], + } + ) + with pytest.raises(NotImplementedError): + gdf.groupby(["a"]).rank(method="min", axis=1) + + def test_groupby_mix_agg_scan(): err_msg = "Cannot perform both aggregation and scan in one operation" func = ["cumsum", "sum"] From 280acdfd65b12b4ac953c193c7d7fd35809e41be Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 28 Apr 2022 14:25:04 -0700 Subject: [PATCH 09/28] Partial cuIO GPU decompression refactor (#10699) Required to expand future nvcomp integration. - [x] Moving nvcomp integration in ORC and Parquet readers to common code. Enables nvcomp use for multiple compression type without code duplication. - [x] `gpu_inflate_input_s` refactor to facilitate unified host/device decompressor interface. Enables further changes to unify CPU and GPU decompression API, which in turn enables ZSTD use in ORC. 
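At a call site, the refactored interface reduces to spans in, spans out (a minimal sketch based on the Avro reader changes below; buffer setup elided):

    // One device_span<uint8_t const> per compressed chunk, one device_span<uint8_t>
    // per output buffer, and one decompress_status result per chunk.
    gpuinflate(inflate_in, inflate_out, inflate_stats, gzip_header_included::NO, stream);
    inflate_stats.device_to_host(stream, true);
    // status == 0: success. status == 1: output buffer too small, with
    // bytes_written reporting the required uncompressed size.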
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Elias Stehle (https://github.com/elstehle) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10699 --- cpp/CMakeLists.txt | 2 + cpp/src/io/avro/reader_impl.cu | 82 +++---- cpp/src/io/comp/debrotli.cu | 75 +++---- cpp/src/io/comp/gpuinflate.cu | 113 +++++----- cpp/src/io/comp/gpuinflate.h | 97 ++++----- cpp/src/io/comp/nvcomp_adapter.cpp | 86 ++++++++ cpp/src/io/comp/nvcomp_adapter.cu | 73 +++++++ cpp/src/io/comp/nvcomp_adapter.cuh | 55 +++++ cpp/src/io/comp/nvcomp_adapter.hpp | 45 ++++ cpp/src/io/comp/snap.cu | 45 ++-- cpp/src/io/comp/unsnap.cu | 68 +++--- cpp/src/io/orc/orc_gpu.h | 32 +-- cpp/src/io/orc/reader_impl.cu | 118 +++------- cpp/src/io/orc/stripe_enc.cu | 101 +++++---- cpp/src/io/orc/stripe_init.cu | 83 ++++--- cpp/src/io/orc/writer_impl.cu | 12 +- cpp/src/io/orc/writer_impl.hpp | 2 +- cpp/src/io/parquet/page_enc.cu | 42 ++-- cpp/src/io/parquet/parquet_gpu.hpp | 16 +- cpp/src/io/parquet/reader_impl.cu | 239 ++++++--------------- cpp/src/io/parquet/writer_impl.cu | 53 ++--- cpp/src/io/utilities/hostdevice_vector.hpp | 6 +- cpp/tests/io/comp/decomp_test.cpp | 100 ++++----- 23 files changed, 834 insertions(+), 711 deletions(-) create mode 100644 cpp/src/io/comp/nvcomp_adapter.cpp create mode 100644 cpp/src/io/comp/nvcomp_adapter.cu create mode 100644 cpp/src/io/comp/nvcomp_adapter.cuh create mode 100644 cpp/src/io/comp/nvcomp_adapter.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 91f67fd0420..15caaec9bec 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -302,6 +302,8 @@ add_library( src/io/comp/cpu_unbz2.cpp src/io/comp/debrotli.cu src/io/comp/gpuinflate.cu + src/io/comp/nvcomp_adapter.cpp + src/io/comp/nvcomp_adapter.cu src/io/comp/snap.cu src/io/comp/uncomp.cpp src/io/comp/unsnap.cu diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 5885b61b35b..556ca6b9d80 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -162,62 +162,66 @@ rmm::device_buffer decompress_data(datasource& source, rmm::cuda_stream_view stream) { if (meta.codec == "deflate") { - size_t uncompressed_data_size = 0; + auto inflate_in = hostdevice_vector>(meta.block_list.size(), stream); + auto inflate_out = hostdevice_vector>(meta.block_list.size(), stream); + auto inflate_stats = hostdevice_vector(meta.block_list.size(), stream); - auto inflate_in = hostdevice_vector(meta.block_list.size(), stream); - auto inflate_out = hostdevice_vector(meta.block_list.size(), stream); + // Guess an initial maximum uncompressed block size. We estimate the compression factor is two + // and round up to the next multiple of 4096 bytes. 
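+  // (If the guess turns out too small for some blocks, the second pass below
+  // resizes the output buffer and decompresses those blocks again.)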
+ uint32_t const initial_blk_len = meta.max_block_size * 2 + (meta.max_block_size * 2) % 4096; + size_t const uncomp_size = initial_blk_len * meta.block_list.size(); - // Guess an initial maximum uncompressed block size - uint32_t initial_blk_len = (meta.max_block_size * 2 + 0xfff) & ~0xfff; - uncompressed_data_size = initial_blk_len * meta.block_list.size(); - for (size_t i = 0; i < inflate_in.size(); ++i) { - inflate_in[i].dstSize = initial_blk_len; - } - - rmm::device_buffer decomp_block_data(uncompressed_data_size, stream); + rmm::device_buffer decomp_block_data(uncomp_size, stream); auto const base_offset = meta.block_list[0].offset; for (size_t i = 0, dst_pos = 0; i < meta.block_list.size(); i++) { auto const src_pos = meta.block_list[i].offset - base_offset; - inflate_in[i].srcDevice = static_cast(comp_block_data.data()) + src_pos; - inflate_in[i].srcSize = meta.block_list[i].size; - inflate_in[i].dstDevice = static_cast(decomp_block_data.data()) + dst_pos; + inflate_in[i] = {static_cast(comp_block_data.data()) + src_pos, + meta.block_list[i].size}; + inflate_out[i] = {static_cast(decomp_block_data.data()) + dst_pos, initial_blk_len}; // Update blocks offsets & sizes to refer to uncompressed data meta.block_list[i].offset = dst_pos; - meta.block_list[i].size = static_cast(inflate_in[i].dstSize); + meta.block_list[i].size = static_cast(inflate_out[i].size()); dst_pos += meta.block_list[i].size; } + inflate_in.host_to_device(stream); for (int loop_cnt = 0; loop_cnt < 2; loop_cnt++) { - inflate_in.host_to_device(stream); - CUDF_CUDA_TRY( - cudaMemsetAsync(inflate_out.device_ptr(), 0, inflate_out.memory_size(), stream.value())); - CUDF_CUDA_TRY(gpuinflate( - inflate_in.device_ptr(), inflate_out.device_ptr(), inflate_in.size(), 0, stream)); - inflate_out.device_to_host(stream, true); + inflate_out.host_to_device(stream); + CUDF_CUDA_TRY(cudaMemsetAsync( + inflate_stats.device_ptr(), 0, inflate_stats.memory_size(), stream.value())); + gpuinflate(inflate_in, inflate_out, inflate_stats, gzip_header_included::NO, stream); + inflate_stats.device_to_host(stream, true); // Check if larger output is required, as it's not known ahead of time if (loop_cnt == 0) { - size_t actual_uncompressed_size = 0; - for (size_t i = 0; i < meta.block_list.size(); i++) { - // If error status is 1 (buffer too small), the `bytes_written` field - // is actually contains the uncompressed data size - if (inflate_out[i].status == 1 && inflate_out[i].bytes_written > inflate_in[i].dstSize) { - inflate_in[i].dstSize = inflate_out[i].bytes_written; - } - actual_uncompressed_size += inflate_in[i].dstSize; - } - if (actual_uncompressed_size > uncompressed_data_size) { - decomp_block_data.resize(actual_uncompressed_size, stream); - for (size_t i = 0, dst_pos = 0; i < meta.block_list.size(); i++) { - auto dst_base = static_cast(decomp_block_data.data()); - inflate_in[i].dstDevice = dst_base + dst_pos; - - meta.block_list[i].offset = dst_pos; - meta.block_list[i].size = static_cast(inflate_in[i].dstSize); - dst_pos += meta.block_list[i].size; + std::vector actual_uncomp_sizes; + actual_uncomp_sizes.reserve(inflate_out.size()); + std::transform(inflate_out.begin(), + inflate_out.end(), + inflate_stats.begin(), + std::back_inserter(actual_uncomp_sizes), + [](auto const& inf_out, auto const& inf_stats) { + // If error status is 1 (buffer too small), the `bytes_written` field + // actually contains the uncompressed data size + return inf_stats.status == 1 + ? 
std::max(inf_out.size(), inf_stats.bytes_written) + : inf_out.size(); + }); + auto const total_actual_uncomp_size = + std::accumulate(actual_uncomp_sizes.cbegin(), actual_uncomp_sizes.cend(), 0ul); + if (total_actual_uncomp_size > uncomp_size) { + decomp_block_data.resize(total_actual_uncomp_size, stream); + for (size_t i = 0; i < meta.block_list.size(); ++i) { + meta.block_list[i].offset = + i > 0 ? (meta.block_list[i - 1].size + meta.block_list[i - 1].offset) : 0; + meta.block_list[i].size = static_cast(actual_uncomp_sizes[i]); + + inflate_out[i] = { + static_cast(decomp_block_data.data()) + meta.block_list[i].offset, + meta.block_list[i].size}; } } else { break; diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu index 631cf19b2aa..cf4d1b0e0f4 100644 --- a/cpp/src/io/comp/debrotli.cu +++ b/cpp/src/io/comp/debrotli.cu @@ -1904,41 +1904,42 @@ static __device__ void ProcessCommands(debrotli_state_s* s, const brotli_diction * * blockDim = {block_size,1,1} * - * @param[in] inputs Source/Destination buffer information per block - * @param[out] outputs Decompressor status per block + * @param[in] inputs Source buffer per block + * @param[out] outputs Destination buffer per block + * @param[out] statuses Decompressor status per block * @param scratch Intermediate device memory heap space (will be dynamically shared between blocks) * @param scratch_size Size of scratch heap space (smaller sizes may result in serialization between - *blocks) - * @param count Number of blocks to decompress + * blocks) */ -extern "C" __global__ void __launch_bounds__(block_size, 2) - gpu_debrotli_kernel(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, +__global__ void __launch_bounds__(block_size, 2) + gpu_debrotli_kernel(device_span const> inputs, + device_span const> outputs, + device_span statuses, uint8_t* scratch, - uint32_t scratch_size, - uint32_t count) + uint32_t scratch_size) { __shared__ __align__(16) debrotli_state_s state_g; int t = threadIdx.x; - int z = blockIdx.x; + auto const block_id = blockIdx.x; debrotli_state_s* const s = &state_g; - if (z >= count) { return; } + if (block_id >= inputs.size()) { return; } // Thread0: initializes shared state and decode stream header if (!t) { - auto const* src = static_cast(inputs[z].srcDevice); - size_t src_size = inputs[z].srcSize; + auto const src = inputs[block_id].data(); + auto const src_size = inputs[block_id].size(); if (src && src_size >= 8) { - s->error = 0; - s->out = s->outbase = static_cast(inputs[z].dstDevice); - s->bytes_left = inputs[z].dstSize; - s->mtf_upper_bound = 63; - s->dist_rb[0] = 16; - s->dist_rb[1] = 15; - s->dist_rb[2] = 11; - s->dist_rb[3] = 4; - s->dist_rb_idx = 0; + s->error = 0; + s->out = outputs[block_id].data(); + s->outbase = s->out; + s->bytes_left = outputs[block_id].size(); + s->mtf_upper_bound = 63; + s->dist_rb[0] = 16; + s->dist_rb[1] = 15; + s->dist_rb[2] = 11; + s->dist_rb[3] = 4; + s->dist_rb_idx = 0; s->p1 = s->p2 = 0; initbits(s, src, src_size); DecodeStreamHeader(s); @@ -2015,9 +2016,10 @@ extern "C" __global__ void __launch_bounds__(block_size, 2) __syncthreads(); // Output decompression status if (!t) { - outputs[z].bytes_written = s->out - s->outbase; - outputs[z].status = s->error; - outputs[z].reserved = s->fb_size; // Return ext heap used by last block (statistics) + statuses[block_id].bytes_written = s->out - s->outbase; + statuses[block_id].status = s->error; + // Return ext heap used by last block (statistics) + statuses[block_id].reserved = s->fb_size; } } @@ -2075,20 
+2077,21 @@ size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs) #include #endif -cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - void* scratch, - size_t scratch_size, - int count, - rmm::cuda_stream_view stream) +void gpu_debrotli(device_span const> inputs, + device_span const> outputs, + device_span statuses, + void* scratch, + size_t scratch_size, + rmm::cuda_stream_view stream) { - uint32_t count32 = (count > 0) ? count : 0; + auto const count = inputs.size(); uint32_t fb_heap_size; auto* scratch_u8 = static_cast(scratch); dim3 dim_block(block_size, 1); - dim3 dim_grid(count32, 1); // TODO: Check max grid dimensions vs max expected count + dim3 dim_grid(count, 1); // TODO: Check max grid dimensions vs max expected count - if (scratch_size < sizeof(brotli_dictionary_s)) { return cudaErrorLaunchOutOfResources; } + CUDF_EXPECTS(scratch_size >= sizeof(brotli_dictionary_s), + "Insufficient scratch space for debrotli"); scratch_size = min(scratch_size, (size_t)0xffffffffu); fb_heap_size = (uint32_t)((scratch_size - sizeof(brotli_dictionary_s)) & ~0xf); @@ -2101,7 +2104,7 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs, cudaMemcpyHostToDevice, stream.value())); gpu_debrotli_kernel<<>>( - inputs, outputs, scratch_u8, fb_heap_size, count32); + inputs, outputs, statuses, scratch_u8, fb_heap_size); #if DUMP_FB_HEAP uint32_t dump[2]; uint32_t cur = 0; @@ -2114,8 +2117,6 @@ cudaError_t __host__ gpu_debrotli(gpu_inflate_input_s* inputs, cur = (dump[0] > cur) ? dump[0] : 0xffffffffu; } #endif - - return cudaSuccess; } } // namespace io diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu index 508e960430d..0d33158da2b 100644 --- a/cpp/src/io/comp/gpuinflate.cu +++ b/cpp/src/io/comp/gpuinflate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -124,8 +124,8 @@ struct inflate_state_s { uint8_t* outbase; ///< start of output buffer uint8_t* outend; ///< end of output buffer // Input state - uint8_t* cur; ///< input buffer - uint8_t* end; ///< end of input buffer + uint8_t const* cur; ///< input buffer + uint8_t const* end; ///< end of input buffer uint2 bitbuf; ///< bit buffer (64-bit) uint32_t bitpos; ///< position in bit buffer @@ -180,10 +180,10 @@ inline __device__ void skipbits(inflate_state_s* s, uint32_t n) { uint32_t bitpos = s->bitpos + n; if (bitpos >= 32) { - uint8_t* cur = s->cur + 8; - s->bitbuf.x = s->bitbuf.y; - s->bitbuf.y = (cur < s->end) ? *reinterpret_cast(cur) : 0; - s->cur = cur - 4; + auto cur = s->cur + 8; + s->bitbuf.x = s->bitbuf.y; + s->bitbuf.y = (cur < s->end) ? 
*reinterpret_cast(cur) : 0; + s->cur = cur - 4; bitpos &= 0x1f; } s->bitpos = bitpos; @@ -510,8 +510,8 @@ __device__ void decode_symbols(inflate_state_s* s) { uint32_t bitpos = s->bitpos; uint2 bitbuf = s->bitbuf; - uint8_t* cur = s->cur; - uint8_t* end = s->end; + auto cur = s->cur; + auto end = s->end; int32_t batch = 0; int32_t sym, batch_len; @@ -871,13 +871,11 @@ __device__ int init_stored(inflate_state_s* s) /// Copy bytes from stored block to destination __device__ void copy_stored(inflate_state_s* s, int t) { - int len = s->stored_blk_len; - uint8_t* cur = s->cur + (s->bitpos >> 3); - uint8_t* out = s->out; - uint8_t* outend = s->outend; - uint8_t* cur4; - int slow_bytes = min(len, (int)((16 - (size_t)out) & 0xf)); - int fast_bytes, bitpos; + auto len = s->stored_blk_len; + auto cur = s->cur + s->bitpos / 8; + auto out = s->out; + auto outend = s->outend; + auto const slow_bytes = min(len, (int)((16 - reinterpret_cast(out)) % 16)); // Slow copy until output is 16B aligned if (slow_bytes) { @@ -890,11 +888,11 @@ __device__ void copy_stored(inflate_state_s* s, int t) out += slow_bytes; len -= slow_bytes; } - fast_bytes = len; + auto fast_bytes = len; if (out < outend) { fast_bytes = (int)min((size_t)fast_bytes, (outend - out)); } fast_bytes &= ~0xf; - bitpos = ((int)(3 & (size_t)cur)) << 3; - cur4 = cur - (bitpos >> 3); + auto bitpos = ((int)((size_t)cur % 4)) * 8; + auto cur4 = cur - (bitpos / 8); if (out < outend) { // Fast copy 16 bytes at a time for (int i = t * 16; i < fast_bytes; i += blockDim.x * 16) { @@ -926,13 +924,13 @@ __device__ void copy_stored(inflate_state_s* s, int t) __syncthreads(); if (t == 0) { // Reset bitstream to end of block - uint8_t* p = cur + len; + auto p = cur + len; auto prefix_bytes = (uint32_t)(((size_t)p) & 3); p -= prefix_bytes; s->cur = p; - s->bitbuf.x = (p < s->end) ? *reinterpret_cast(p) : 0; + s->bitbuf.x = (p < s->end) ? *reinterpret_cast(p) : 0; p += 4; - s->bitbuf.y = (p < s->end) ? *reinterpret_cast(p) : 0; + s->bitbuf.y = (p < s->end) ? *reinterpret_cast(p) : 0; s->bitpos = prefix_bytes * 8; s->out = out; } @@ -1021,12 +1019,16 @@ __device__ int parse_gzip_header(const uint8_t* src, size_t src_size) * * @tparam block_size Thread block dimension for this call * @param inputs Source and destination buffer information per block - * @param outputs Decompression status buffer per block + * @param outputs Destination buffer information per block + * @param statuses Decompression status buffer per block * @param parse_hdr If nonzero, indicates that the compressed bitstream includes a GZIP header */ template __global__ void __launch_bounds__(block_size) - inflate_kernel(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs, int parse_hdr) + inflate_kernel(device_span const> inputs, + device_span const> outputs, + device_span statuses, + gzip_header_included parse_hdr) { __shared__ __align__(16) inflate_state_s state_g; @@ -1035,12 +1037,11 @@ __global__ void __launch_bounds__(block_size) inflate_state_s* state = &state_g; if (!t) { - auto* p = const_cast(static_cast(inputs[z].srcDevice)); - size_t src_size = inputs[z].srcSize; - uint32_t prefix_bytes; + auto p = inputs[z].data(); + auto src_size = inputs[z].size(); // Parse header if needed state->err = 0; - if (parse_hdr) { + if (parse_hdr == gzip_header_included::YES) { int hdr_len = parse_gzip_header(p, src_size); src_size = (src_size >= 8) ? 
src_size - 8 : 0; // ignore footer if (hdr_len >= 0) { @@ -1051,16 +1052,16 @@ __global__ void __launch_bounds__(block_size) } } // Initialize shared state - state->out = const_cast(static_cast(inputs[z].dstDevice)); - state->outbase = state->out; - state->outend = state->out + inputs[z].dstSize; - state->end = p + src_size; - prefix_bytes = (uint32_t)(((size_t)p) & 3); + state->out = outputs[z].data(); + state->outbase = state->out; + state->outend = state->out + outputs[z].size(); + state->end = p + src_size; + auto const prefix_bytes = (uint32_t)(((size_t)p) & 3); p -= prefix_bytes; state->cur = p; - state->bitbuf.x = (p < state->end) ? *reinterpret_cast(p) : 0; + state->bitbuf.x = (p < state->end) ? *reinterpret_cast(p) : 0; p += 4; - state->bitbuf.y = (p < state->end) ? *reinterpret_cast(p) : 0; + state->bitbuf.y = (p < state->end) ? *reinterpret_cast(p) : 0; state->bitpos = prefix_bytes * 8; } __syncthreads(); @@ -1132,9 +1133,9 @@ __global__ void __launch_bounds__(block_size) // Output buffer too small state->err = 1; } - outputs[z].bytes_written = state->out - state->outbase; - outputs[z].status = state->err; - outputs[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes + statuses[z].bytes_written = state->out - state->outbase; + statuses[z].status = state->err; + statuses[z].reserved = (int)(state->end - state->cur); // Here mainly for debug purposes } } @@ -1145,7 +1146,9 @@ __global__ void __launch_bounds__(block_size) * * @param inputs Source and destination information per block */ -__global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_input_s* inputs) +__global__ void __launch_bounds__(1024) + copy_uncompressed_kernel(device_span const> inputs, + device_span const> outputs) { __shared__ const uint8_t* volatile src_g; __shared__ uint8_t* volatile dst_g; @@ -1158,9 +1161,9 @@ __global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_inp uint32_t len, src_align_bytes, src_align_bits, dst_align_bytes; if (!t) { - src = static_cast(inputs[z].srcDevice); - dst = static_cast(inputs[z].dstDevice); - len = min((uint32_t)inputs[z].srcSize, (uint32_t)inputs[z].dstSize); + src = inputs[z].data(); + dst = outputs[z].data(); + len = static_cast(min(inputs[z].size(), outputs[z].size())); src_g = src; dst_g = dst; copy_len_g = len; @@ -1195,26 +1198,26 @@ __global__ void __launch_bounds__(1024) copy_uncompressed_kernel(gpu_inflate_inp if (t < len) { dst[t] = src[t]; } } -cudaError_t __host__ gpuinflate(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - int parse_hdr, - rmm::cuda_stream_view stream) +void gpuinflate(device_span const> inputs, + device_span const> outputs, + device_span statuses, + gzip_header_included parse_hdr, + rmm::cuda_stream_view stream) { constexpr int block_size = 128; // Threads per block - if (count > 0) { + if (inputs.size() > 0) { inflate_kernel - <<>>(inputs, outputs, parse_hdr); + <<>>(inputs, outputs, statuses, parse_hdr); } - return cudaSuccess; } -cudaError_t __host__ gpu_copy_uncompressed_blocks(gpu_inflate_input_s* inputs, - int count, - rmm::cuda_stream_view stream) +void gpu_copy_uncompressed_blocks(device_span const> inputs, + device_span const> outputs, + rmm::cuda_stream_view stream) { - if (count > 0) { copy_uncompressed_kernel<<>>(inputs); } - return cudaSuccess; + if (inputs.size() > 0) { + copy_uncompressed_kernel<<>>(inputs, outputs); + } } } // namespace io diff --git a/cpp/src/io/comp/gpuinflate.h b/cpp/src/io/comp/gpuinflate.h index 
29856bcd3f3..3870b2ac3b3 100644 --- a/cpp/src/io/comp/gpuinflate.h +++ b/cpp/src/io/comp/gpuinflate.h @@ -16,75 +16,70 @@ #pragma once -#include +#include #include +#include + namespace cudf { namespace io { -/** - * @brief Input parameters for the decompression interface - */ -struct gpu_inflate_input_s { - const void* srcDevice; - uint64_t srcSize; - void* dstDevice; - uint64_t dstSize; -}; /** * @brief Output parameters for the decompression interface */ -struct gpu_inflate_status_s { +struct decompress_status { uint64_t bytes_written; uint32_t status; uint32_t reserved; }; +enum class gzip_header_included { NO, YES }; + /** * @brief Interface for decompressing GZIP-compressed data * * Multiple, independent chunks of compressed data can be decompressed by using - * separate gpu_inflate_input_s/gpu_inflate_status_s pairs for each chunk. + * separate input/output/status for each chunk. * - * @param[in] inputs List of input argument structures - * @param[out] outputs List of output status structures - * @param[in] count Number of input/output structures + * @param[in] inputs List of input buffers + * @param[out] outputs List of output buffers + * @param[out] statuses List of output status structures * @param[in] parse_hdr Whether or not to parse GZIP header * @param[in] stream CUDA stream to use */ -cudaError_t gpuinflate(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - int parse_hdr, - rmm::cuda_stream_view stream); +void gpuinflate(device_span const> inputs, + device_span const> outputs, + device_span statuses, + gzip_header_included parse_hdr, + rmm::cuda_stream_view stream); /** * @brief Interface for copying uncompressed byte blocks * - * @param[in] inputs List of input argument structures - * @param[in] count Number of input structures + * @param[in] inputs List of input buffers + * @param[out] outputs List of output buffers * @param[in] stream CUDA stream to use */ -cudaError_t gpu_copy_uncompressed_blocks(gpu_inflate_input_s* inputs, - int count, - rmm::cuda_stream_view stream); +void gpu_copy_uncompressed_blocks(device_span const> inputs, + device_span const> outputs, + rmm::cuda_stream_view stream); /** * @brief Interface for decompressing Snappy-compressed data * * Multiple, independent chunks of compressed data can be decompressed by using - * separate gpu_inflate_input_s/gpu_inflate_status_s pairs for each chunk. + * separate input/output/status for each chunk. * - * @param[in] inputs List of input argument structures - * @param[out] outputs List of output status structures - * @param[in] count Number of input/output structures + * @param[in] inputs List of input buffers + * @param[out] outputs List of output buffers + * @param[out] statuses List of output status structures * @param[in] stream CUDA stream to use */ -cudaError_t gpu_unsnap(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - rmm::cuda_stream_view stream); +void gpu_unsnap(device_span const> inputs, + device_span const> outputs, + device_span statuses, + rmm::cuda_stream_view stream); /** * @brief Computes the size of temporary memory for Brotli decompression @@ -99,37 +94,37 @@ size_t get_gpu_debrotli_scratch_size(int max_num_inputs = 0); * @brief Interface for decompressing Brotli-compressed data * * Multiple, independent chunks of compressed data can be decompressed by using - * separate gpu_inflate_input_s/gpu_inflate_status_s pairs for each chunk. + * separate input/output/status pairs for each chunk. 
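+ * Each chunk is decompressed independently, so a failure reported in one
+ * chunk's status entry does not affect the results of the other chunks.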
diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp
new file mode 100644
index 00000000000..b2e6f07b80b
--- /dev/null
+++ b/cpp/src/io/comp/nvcomp_adapter.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "nvcomp_adapter.hpp"
+#include "nvcomp_adapter.cuh"
+
+#include <cudf/utilities/error.hpp>
+
+#include <nvcomp/snappy.h>
+
+namespace cudf::io::nvcomp {
+
+template <typename... Args>
+auto batched_decompress_get_temp_size(compression_type type, Args&&... args)
+{
+  switch (type) {
+    case compression_type::SNAPPY:
+      return nvcompBatchedSnappyDecompressGetTempSize(std::forward<Args>(args)...);
+    default: CUDF_FAIL("Unsupported compression type");
+  }
+};
+
+template <typename... Args>
+auto batched_decompress_async(compression_type type, Args&&... args)
+{
+  switch (type) {
+    case compression_type::SNAPPY:
+      return nvcompBatchedSnappyDecompressAsync(std::forward<Args>(args)...);
+    default: CUDF_FAIL("Unsupported compression type");
+  }
+};
+
+size_t get_temp_size(compression_type type, size_t num_chunks, size_t max_uncomp_chunk_size)
+{
+  size_t temp_size             = 0;
+  nvcompStatus_t nvcomp_status =
+    batched_decompress_get_temp_size(type, num_chunks, max_uncomp_chunk_size, &temp_size);
+  CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess,
+               "Unable to get scratch size for decompression");
+
+  return temp_size;
+}
+
+void batched_decompress(compression_type type,
+                        device_span<device_span<uint8_t const> const> inputs,
+                        device_span<device_span<uint8_t> const> outputs,
+                        device_span<decompress_status> statuses,
+                        size_t max_uncomp_chunk_size,
+                        rmm::cuda_stream_view stream)
+{
+  auto const num_chunks = inputs.size();
+
+  // cuDF inflate inputs converted to nvcomp inputs
+  auto const nvcomp_args = create_batched_nvcomp_args(inputs, outputs, stream);
+  rmm::device_uvector<size_t> actual_uncompressed_data_sizes(num_chunks, stream);
+  rmm::device_uvector<nvcompStatus_t> nvcomp_statuses(num_chunks, stream);
+  // Temporary space required for decompression
+  rmm::device_buffer scratch(get_temp_size(type, num_chunks, max_uncomp_chunk_size), stream);
+  auto const nvcomp_status = batched_decompress_async(type,
+                                                      nvcomp_args.compressed_data_ptrs.data(),
+                                                      nvcomp_args.compressed_data_sizes.data(),
+                                                      nvcomp_args.uncompressed_data_sizes.data(),
+                                                      actual_uncompressed_data_sizes.data(),
+                                                      num_chunks,
+                                                      scratch.data(),
+                                                      scratch.size(),
+                                                      nvcomp_args.uncompressed_data_ptrs.data(),
+                                                      nvcomp_statuses.data(),
+                                                      stream.value());
+  CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, "unable to perform decompression");
+
+  convert_status(nvcomp_statuses, actual_uncompressed_data_sizes, statuses, stream);
+}
+}  // namespace cudf::io::nvcomp
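This entry point is consumed by the ORC and Parquet reader changes later in this patch. A one-call sketch, assuming `d_inputs`, `d_outputs`, `d_statuses`, and `max_uncomp_block_size` were prepared by the caller as in the sketch after the gpuinflate.h header above:

    // Illustrative call, matching the reader-side dispatch added later in this patch.
    cudf::io::nvcomp::batched_decompress(cudf::io::nvcomp::compression_type::SNAPPY,
                                         d_inputs,    // device_span<device_span<uint8_t const> const>
                                         d_outputs,   // device_span<device_span<uint8_t> const>
                                         d_statuses,  // device_span<decompress_status>
                                         max_uncomp_block_size,
                                         stream);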
diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu
new file mode 100644
index 00000000000..ce294cc9b00
--- /dev/null
+++ b/cpp/src/io/comp/nvcomp_adapter.cu
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "nvcomp_adapter.cuh"
+
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/transform.h>
+
+namespace cudf::io::nvcomp {
+
+batched_args create_batched_nvcomp_args(device_span<device_span<uint8_t const> const> inputs,
+                                        device_span<device_span<uint8_t> const> outputs,
+                                        rmm::cuda_stream_view stream)
+{
+  size_t num_comp_pages = inputs.size();
+  rmm::device_uvector<void const*> compressed_data_ptrs(num_comp_pages, stream);
+  rmm::device_uvector<size_t> compressed_data_sizes(num_comp_pages, stream);
+  rmm::device_uvector<void*> uncompressed_data_ptrs(num_comp_pages, stream);
+  rmm::device_uvector<size_t> uncompressed_data_sizes(num_comp_pages, stream);
+
+  // Prepare the input vectors
+  auto ins_it =
+    thrust::make_zip_iterator(compressed_data_ptrs.begin(), compressed_data_sizes.begin());
+  thrust::transform(
+    rmm::exec_policy(stream), inputs.begin(), inputs.end(), ins_it, [] __device__(auto const& in) {
+      return thrust::make_tuple(in.data(), in.size());
+    });
+
+  // Prepare the output vectors
+  auto outs_it =
+    thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), uncompressed_data_sizes.begin());
+  thrust::transform(
+    rmm::exec_policy(stream),
+    outputs.begin(),
+    outputs.end(),
+    outs_it,
+    [] __device__(auto const& out) { return thrust::make_tuple(out.data(), out.size()); });
+
+  return {std::move(compressed_data_ptrs),
+          std::move(compressed_data_sizes),
+          std::move(uncompressed_data_ptrs),
+          std::move(uncompressed_data_sizes)};
+}
+
+void convert_status(device_span<nvcompStatus_t const> nvcomp_stats,
+                    device_span<size_t const> actual_uncompressed_sizes,
+                    device_span<decompress_status> cudf_stats,
+                    rmm::cuda_stream_view stream)
+{
+  thrust::transform(
+    rmm::exec_policy(stream),
+    nvcomp_stats.begin(),
+    nvcomp_stats.end(),
+    actual_uncompressed_sizes.begin(),
+    cudf_stats.begin(),
+    [] __device__(auto const& status, auto const& size) {
+      return decompress_status{size, status == nvcompStatus_t::nvcompSuccess ? 0u : 1u};
+    });
+}
+}  // namespace cudf::io::nvcomp
diff --git a/cpp/src/io/comp/nvcomp_adapter.cuh b/cpp/src/io/comp/nvcomp_adapter.cuh
new file mode 100644
index 00000000000..a76ddcf6813
--- /dev/null
+++ b/cpp/src/io/comp/nvcomp_adapter.cuh
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "gpuinflate.h"
+
+#include <cudf/utilities/span.hpp>
+
+#include <nvcomp.h>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+
+namespace cudf::io::nvcomp {
+
+struct batched_args {
+  rmm::device_uvector<void const*> compressed_data_ptrs;
+  rmm::device_uvector<size_t> compressed_data_sizes;
+  rmm::device_uvector<void*> uncompressed_data_ptrs;
+  rmm::device_uvector<size_t> uncompressed_data_sizes;
+};
+
+/**
+ * @brief Split lists of src/dst device spans into lists of pointers/sizes.
+ *
+ * @param[in] inputs List of input buffers
+ * @param[in] outputs List of output buffers
+ * @param[in] stream CUDA stream to use
+ */
+batched_args create_batched_nvcomp_args(device_span<device_span<uint8_t const> const> inputs,
+                                        device_span<device_span<uint8_t> const> outputs,
+                                        rmm::cuda_stream_view stream);
+
+/**
+ * @brief Convert nvcomp statuses into cuIO compression statuses.
+ */
+void convert_status(device_span<nvcompStatus_t const> nvcomp_stats,
+                    device_span<size_t const> actual_uncompressed_sizes,
+                    device_span<decompress_status> cudf_stats,
+                    rmm::cuda_stream_view stream);
+}  // namespace cudf::io::nvcomp
diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp
new file mode 100644
index 00000000000..a0eb6bc4fbf
--- /dev/null
+++ b/cpp/src/io/comp/nvcomp_adapter.hpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "gpuinflate.h"
+
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf::io::nvcomp {
+
+enum class compression_type { SNAPPY };
+
+/**
+ * @brief Device batch decompression of given type.
+ *
+ * @param[in] type Compression type
+ * @param[in] inputs List of input buffers
+ * @param[out] outputs List of output buffers
+ * @param[out] statuses List of output status structures
+ * @param[in] max_uncomp_page_size maximum size of uncompressed block
+ * @param[in] stream CUDA stream to use
+ */
+void batched_decompress(compression_type type,
+                        device_span<device_span<uint8_t const> const> inputs,
+                        device_span<device_span<uint8_t> const> outputs,
+                        device_span<decompress_status> statuses,
+                        size_t max_uncomp_page_size,
+                        rmm::cuda_stream_view stream);
+}  // namespace cudf::io::nvcomp
diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu
index 9f0a610f8f7..d64eea06631 100644
--- a/cpp/src/io/comp/snap.cu
+++ b/cpp/src/io/comp/snap.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
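The compression side mirrors the decompression interface. A sketch of driving the span-based `gpu_snap` (illustrative only; `h_src` and `h_dst` are hypothetical host vectors of device spans prepared as in the earlier sketch):

    // Illustrative only: batch Snappy compression through the new interface.
    auto d_in  = cudf::detail::make_device_uvector_async(h_src, stream);
    auto d_out = cudf::detail::make_device_uvector_async(h_dst, stream);
    rmm::device_uvector<cudf::io::decompress_status> d_stats(h_src.size(), stream);

    cudf::io::gpu_snap(d_in, d_out, d_stats, stream);
    // Per block: bytes_written holds the compressed size; status is set to 1
    // when the output buffer was too small (see snap_kernel below).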
@@ -258,7 +258,9 @@ static __device__ uint32_t Match60(const uint8_t* src1, * @param[in] count Number of blocks to compress */ __global__ void __launch_bounds__(128) - snap_kernel(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs, int count) + snap_kernel(device_span const> inputs, + device_span const> outputs, + device_span statuses) { __shared__ __align__(16) snap_state_s state_g; @@ -268,15 +270,15 @@ __global__ void __launch_bounds__(128) const uint8_t* src; if (!t) { - const auto* src = static_cast(inputs[blockIdx.x].srcDevice); - auto src_len = static_cast(inputs[blockIdx.x].srcSize); - auto* dst = static_cast(inputs[blockIdx.x].dstDevice); - auto dst_len = static_cast(inputs[blockIdx.x].dstSize); - uint8_t* end = dst + dst_len; - s->src = src; - s->src_len = src_len; - s->dst_base = dst; - s->end = end; + auto const src = inputs[blockIdx.x].data(); + auto src_len = static_cast(inputs[blockIdx.x].size()); + auto dst = outputs[blockIdx.x].data(); + auto const dst_len = static_cast(outputs[blockIdx.x].size()); + auto const end = dst + dst_len; + s->src = src; + s->src_len = src_len; + s->dst_base = dst; + s->end = end; while (src_len > 0x7f) { if (dst < end) { dst[0] = src_len | 0x80; } dst++; @@ -335,23 +337,22 @@ __global__ void __launch_bounds__(128) } __syncthreads(); if (!t) { - outputs[blockIdx.x].bytes_written = s->dst - s->dst_base; - outputs[blockIdx.x].status = (s->dst > s->end) ? 1 : 0; - outputs[blockIdx.x].reserved = 0; + statuses[blockIdx.x].bytes_written = s->dst - s->dst_base; + statuses[blockIdx.x].status = (s->dst > s->end) ? 1 : 0; + statuses[blockIdx.x].reserved = 0; } } -cudaError_t __host__ gpu_snap(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - rmm::cuda_stream_view stream) +void gpu_snap(device_span const> inputs, + device_span const> outputs, + device_span statuses, + rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block - dim3 dim_grid(count, 1); - if (count > 0) { - snap_kernel<<>>(inputs, outputs, count); + dim3 dim_grid(inputs.size(), 1); + if (inputs.size() > 0) { + snap_kernel<<>>(inputs, outputs, statuses); } - return cudaSuccess; } } // namespace io diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu index 791a16bc912..dc44b9fcd59 100644 --- a/cpp/src/io/comp/unsnap.cu +++ b/cpp/src/io/comp/unsnap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, NVIDIA CORPORATION. + * Copyright (c) 2018-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
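Both kernels only record per-block results in `decompress_status`; callers are responsible for surfacing failures. A minimal host-side check, in the spirit of the `decompress_check` helper this patch adds to the Parquet reader (sketch; `check_all_succeeded` is a hypothetical name, and a `.cu` translation unit with extended device lambdas is assumed):

    // Illustrative sketch: surface per-block failures after a batch call.
    #include "gpuinflate.h"

    #include <cudf/utilities/error.hpp>
    #include <cudf/utilities/span.hpp>

    #include <rmm/cuda_stream_view.hpp>
    #include <rmm/exec_policy.hpp>

    #include <thrust/logical.h>

    void check_all_succeeded(cudf::device_span<cudf::io::decompress_status const> stats,
                             rmm::cuda_stream_view stream)
    {
      // status == 0 means the block decompressed successfully.
      CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream),
                                  stats.begin(),
                                  stats.end(),
                                  [] __device__(auto const& s) { return s.status == 0; }),
                   "Error during decompression");
    }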
@@ -64,14 +64,15 @@ struct unsnap_queue_s { * @brief snappy decompression state */ struct unsnap_state_s { - const uint8_t* base; ///< base ptr of compressed stream - const uint8_t* end; ///< end of compressed stream - uint32_t uncompressed_size; ///< uncompressed stream size - uint32_t bytes_left; ///< bytes to uncompressed remaining - int32_t error; ///< current error status - uint32_t tstart; ///< start time for perf logging - volatile unsnap_queue_s q; ///< queue for cross-warp communication - gpu_inflate_input_s in; ///< input parameters for current block + const uint8_t* base; ///< base ptr of compressed stream + const uint8_t* end; ///< end of compressed stream + uint32_t uncompressed_size; ///< uncompressed stream size + uint32_t bytes_left; ///< remaining bytes to decompress + int32_t error; ///< current error status + uint32_t tstart; ///< start time for perf logging + volatile unsnap_queue_s q; ///< queue for cross-warp communication + device_span src; ///< input for current block + device_span dst; ///< output for current block }; inline __device__ volatile uint8_t& byte_access(unsnap_state_s* s, uint32_t pos) @@ -497,9 +498,9 @@ __device__ void snappy_decode_symbols(unsnap_state_s* s, uint32_t t) template __device__ void snappy_process_symbols(unsnap_state_s* s, int t, Storage& temp_storage) { - const uint8_t* literal_base = s->base; - auto* out = static_cast(s->in.dstDevice); - int batch = 0; + auto const literal_base = s->base; + auto out = s->dst.data(); + int batch = 0; do { volatile unsnap_batch_s* b = &s->q.batch[batch * batch_size]; @@ -624,7 +625,9 @@ __device__ void snappy_process_symbols(unsnap_state_s* s, int t, Storage& temp_s */ template __global__ void __launch_bounds__(block_size) - unsnap_kernel(gpu_inflate_input_s* inputs, gpu_inflate_status_s* outputs) + unsnap_kernel(device_span const> inputs, + device_span const> outputs, + device_span statuses) { __shared__ __align__(16) unsnap_state_s state_g; __shared__ cub::WarpReduce::TempStorage temp_storage; @@ -632,16 +635,14 @@ __global__ void __launch_bounds__(block_size) unsnap_state_s* s = &state_g; int strm_id = blockIdx.x; - if (t < sizeof(gpu_inflate_input_s) / sizeof(uint32_t)) { - reinterpret_cast(&s->in)[t] = reinterpret_cast(&inputs[strm_id])[t]; - __threadfence_block(); - } if (t < batch_count) { s->q.batch_len[t] = 0; } __syncthreads(); if (!t) { - const auto* cur = static_cast(s->in.srcDevice); - const uint8_t* end = cur + s->in.srcSize; - s->error = 0; + s->src = inputs[strm_id]; + s->dst = outputs[strm_id]; + auto cur = s->src.begin(); + auto const end = s->src.end(); + s->error = 0; if (log_cyclecount) { s->tstart = clock(); } if (cur < end) { // Read uncompressed size (varint), limited to 32-bit @@ -672,7 +673,7 @@ __global__ void __launch_bounds__(block_size) s->bytes_left = uncompressed_size; s->base = cur; s->end = end; - if ((cur >= end && uncompressed_size != 0) || (uncompressed_size > s->in.dstSize)) { + if ((cur >= end && uncompressed_size != 0) || (uncompressed_size > s->dst.size())) { s->error = -1; } } else { @@ -697,28 +698,25 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); } if (!t) { - outputs[strm_id].bytes_written = s->uncompressed_size - s->bytes_left; - outputs[strm_id].status = s->error; + statuses[strm_id].bytes_written = s->uncompressed_size - s->bytes_left; + statuses[strm_id].status = s->error; if (log_cyclecount) { - outputs[strm_id].reserved = clock() - s->tstart; + statuses[strm_id].reserved = clock() - s->tstart; } else { - outputs[strm_id].reserved = 0; 
+ statuses[strm_id].reserved = 0; } } } -cudaError_t __host__ gpu_unsnap(gpu_inflate_input_s* inputs, - gpu_inflate_status_s* outputs, - int count, - rmm::cuda_stream_view stream) +void gpu_unsnap(device_span const> inputs, + device_span const> outputs, + device_span statuses, + rmm::cuda_stream_view stream) { - uint32_t count32 = (count > 0) ? count : 0; - dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block - dim3 dim_grid(count32, 1); // TODO: Check max grid dimensions vs max expected count - - unsnap_kernel<128><<>>(inputs, outputs); + dim3 dim_block(128, 1); // 4 warps per stream, 1 stream per block + dim3 dim_grid(inputs.size(), 1); // TODO: Check max grid dimensions vs max expected count - return cudaSuccess; + unsnap_kernel<128><<>>(inputs, outputs, statuses); } } // namespace io diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h index d94aa00c7b9..837fd03a112 100644 --- a/cpp/src/io/orc/orc_gpu.h +++ b/cpp/src/io/orc/orc_gpu.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,9 +43,10 @@ struct CompressedStreamInfo { : compressed_data(compressed_data_), uncompressed_data(nullptr), compressed_data_size(compressed_size_), - decctl(nullptr), - decstatus(nullptr), - copyctl(nullptr), + dec_in_ctl(nullptr), + dec_out_ctl(nullptr), + copy_in_ctl(nullptr), + copy_out_ctl(nullptr), num_compressed_blocks(0), num_uncompressed_blocks(0), max_uncompressed_size(0), @@ -54,14 +55,15 @@ struct CompressedStreamInfo { } const uint8_t* compressed_data; // [in] base ptr to compressed stream data uint8_t* uncompressed_data; // [in] base ptr to uncompressed stream data or NULL if not known yet - size_t compressed_data_size; // [in] compressed data size for this stream - gpu_inflate_input_s* decctl; // [in] base ptr to decompression structure to be filled - gpu_inflate_status_s* decstatus; // [in] results of decompression - gpu_inflate_input_s* - copyctl; // [in] base ptr to copy structure to be filled for uncompressed blocks + size_t compressed_data_size; // [in] compressed data size for this stream + device_span* dec_in_ctl; // [in] input buffer to decompress + device_span* dec_out_ctl; // [in] output buffer to decompress into + device_span decstatus; // [in] results of decompression + device_span* copy_in_ctl; // [out] input buffer to copy + device_span* copy_out_ctl; // [out] output buffer to copy to uint32_t num_compressed_blocks; // [in,out] number of entries in decctl(in), number of compressed // blocks(out) - uint32_t num_uncompressed_blocks; // [in,out] number of entries in copyctl(in), number of + uint32_t num_uncompressed_blocks; // [in,out] number of entries in dec_in_ctl(in), number of // uncompressed blocks(out) uint64_t max_uncompressed_size; // [out] maximum uncompressed data size of stream uint32_t max_uncompressed_block_size; // [out] maximum uncompressed size of any block in stream @@ -345,8 +347,9 @@ void CompactOrcDataStreams(device_2dspan strm_desc, * @param[in] max_comp_blk_size Max size of any block after compression * @param[in,out] strm_desc StripeStream device array [stripe][stream] * @param[in,out] enc_streams chunk streams device array [column][rowgroup] - * @param[out] comp_in Per-block compression input parameters - * @param[out] comp_out Per-block compression status + * @param[out] comp_in Per-block compression input buffers + * 
@param[out] comp_out Per-block compression output buffers + * @param[out] comp_stat Per-block compression status * @param[in] stream CUDA stream used for device memory operations and kernel launches */ void CompressOrcDataStreams(uint8_t* compressed_data, @@ -356,8 +359,9 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - device_span comp_in, - device_span comp_out, + device_span> comp_in, + device_span> comp_out, + device_span comp_stat, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index a768d568178..139eb28d1a1 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -25,6 +25,7 @@ #include "timezone.cuh" #include +#include #include #include @@ -40,8 +41,6 @@ #include #include -#include - #include #include #include @@ -262,7 +261,7 @@ auto decimal_column_type(std::vector const& decimal128_columns, } // namespace -__global__ void decompress_check_kernel(device_span stats, +__global__ void decompress_check_kernel(device_span stats, bool* any_block_failure) { auto tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -273,7 +272,7 @@ __global__ void decompress_check_kernel(device_span } } -void decompress_check(device_span stats, +void decompress_check(device_span stats, bool* any_block_failure, rmm::cuda_stream_view stream) { @@ -284,74 +283,6 @@ void decompress_check(device_span stats, decompress_check_kernel<<>>(stats, any_block_failure); } -__global__ void convert_nvcomp_status(device_span nvcomp_stats, - device_span actual_uncompressed_sizes, - device_span stats) -{ - auto tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < stats.size()) { - stats[tid].status = nvcomp_stats[tid] == nvcompStatus_t::nvcompSuccess ? 
0 : 1; - stats[tid].bytes_written = actual_uncompressed_sizes[tid]; - } -} - -void snappy_decompress(device_span comp_in, - device_span comp_stat, - size_t max_uncomp_page_size, - rmm::cuda_stream_view stream) -{ - size_t num_blocks = comp_in.size(); - size_t temp_size; - - auto status = - nvcompBatchedSnappyDecompressGetTempSize(num_blocks, max_uncomp_page_size, &temp_size); - CUDF_EXPECTS(nvcompStatus_t::nvcompSuccess == status, - "Unable to get scratch size for snappy decompression"); - - rmm::device_buffer scratch(temp_size, stream); - rmm::device_uvector compressed_data_ptrs(num_blocks, stream); - rmm::device_uvector compressed_data_sizes(num_blocks, stream); - rmm::device_uvector uncompressed_data_ptrs(num_blocks, stream); - rmm::device_uvector uncompressed_data_sizes(num_blocks, stream); - - rmm::device_uvector actual_uncompressed_data_sizes(num_blocks, stream); - rmm::device_uvector statuses(num_blocks, stream); - - device_span actual_uncompressed_sizes_span(actual_uncompressed_data_sizes.data(), - actual_uncompressed_data_sizes.size()); - device_span statuses_span(statuses.data(), statuses.size()); - - // Prepare the vectors - auto comp_it = thrust::make_zip_iterator(compressed_data_ptrs.begin(), - compressed_data_sizes.begin(), - uncompressed_data_ptrs.begin(), - uncompressed_data_sizes.data()); - thrust::transform(rmm::exec_policy(stream), - comp_in.begin(), - comp_in.end(), - comp_it, - [] __device__(gpu_inflate_input_s in) { - return thrust::make_tuple(in.srcDevice, in.srcSize, in.dstDevice, in.dstSize); - }); - - status = nvcompBatchedSnappyDecompressAsync(compressed_data_ptrs.data(), - compressed_data_sizes.data(), - uncompressed_data_sizes.data(), - actual_uncompressed_data_sizes.data(), - num_blocks, - scratch.data(), - scratch.size(), - uncompressed_data_ptrs.data(), - statuses.data(), - stream.value()); - CUDF_EXPECTS(nvcompStatus_t::nvcompSuccess == status, "unable to perform snappy decompression"); - - dim3 block(128); - dim3 grid(cudf::util::div_rounding_up_safe(num_blocks, static_cast(block.x))); - convert_nvcomp_status<<>>( - statuses_span, actual_uncompressed_sizes_span, comp_stat); -} - rmm::device_buffer reader::impl::decompress_stripe_data( cudf::detail::hostdevice_2dvector& chunks, const std::vector& stripe_data, @@ -396,9 +327,11 @@ rmm::device_buffer reader::impl::decompress_stripe_data( CUDF_EXPECTS(total_decomp_size > 0, "No decompressible data found"); rmm::device_buffer decomp_data(total_decomp_size, stream); - rmm::device_uvector inflate_in( + rmm::device_uvector> inflate_in( + num_compressed_blocks + num_uncompressed_blocks, stream); + rmm::device_uvector> inflate_out( num_compressed_blocks + num_uncompressed_blocks, stream); - rmm::device_uvector inflate_out(num_compressed_blocks, stream); + rmm::device_uvector inflate_stats(num_compressed_blocks, stream); // Parse again to populate the decompression input/output buffers size_t decomp_offset = 0; @@ -408,9 +341,11 @@ rmm::device_buffer reader::impl::decompress_stripe_data( for (size_t i = 0; i < compinfo.size(); ++i) { auto dst_base = static_cast(decomp_data.data()); compinfo[i].uncompressed_data = dst_base + decomp_offset; - compinfo[i].decctl = inflate_in.data() + start_pos; - compinfo[i].decstatus = inflate_out.data() + start_pos; - compinfo[i].copyctl = inflate_in.data() + start_pos_uncomp; + compinfo[i].dec_in_ctl = inflate_in.data() + start_pos; + compinfo[i].dec_out_ctl = inflate_out.data() + start_pos; + compinfo[i].decstatus = {inflate_stats.data() + start_pos, 
compinfo[i].num_compressed_blocks}; + compinfo[i].copy_in_ctl = inflate_in.data() + start_pos_uncomp; + compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp; stream_info[i].dst_pos = decomp_offset; decomp_offset += compinfo[i].max_uncompressed_size; @@ -428,29 +363,36 @@ rmm::device_buffer reader::impl::decompress_stripe_data( // Dispatch batches of blocks to decompress if (num_compressed_blocks > 0) { - device_span inflate_out_view(inflate_out.data(), num_compressed_blocks); + device_span> inflate_in_view{inflate_in.data(), + num_compressed_blocks}; + device_span> inflate_out_view{inflate_out.data(), num_compressed_blocks}; switch (decompressor->GetKind()) { case orc::ZLIB: - CUDF_CUDA_TRY( - gpuinflate(inflate_in.data(), inflate_out.data(), num_compressed_blocks, 0, stream)); + gpuinflate( + inflate_in_view, inflate_out_view, inflate_stats, gzip_header_included::NO, stream); break; case orc::SNAPPY: if (nvcomp_integration::is_stable_enabled()) { - device_span inflate_in_view{inflate_in.data(), - num_compressed_blocks}; - snappy_decompress(inflate_in_view, inflate_out_view, max_uncomp_block_size, stream); + nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, + inflate_in_view, + inflate_out_view, + inflate_stats, + max_uncomp_block_size, + stream); } else { - CUDF_CUDA_TRY( - gpu_unsnap(inflate_in.data(), inflate_out.data(), num_compressed_blocks, stream)); + gpu_unsnap(inflate_in_view, inflate_out_view, inflate_stats, stream); } break; default: CUDF_FAIL("Unexpected decompression dispatch"); break; } - decompress_check(inflate_out_view, any_block_failure.device_ptr(), stream); + decompress_check(inflate_stats, any_block_failure.device_ptr(), stream); } if (num_uncompressed_blocks > 0) { - CUDF_CUDA_TRY(gpu_copy_uncompressed_blocks( - inflate_in.data() + num_compressed_blocks, num_uncompressed_blocks, stream)); + device_span> copy_in_view{inflate_in.data() + num_compressed_blocks, + num_uncompressed_blocks}; + device_span> copy_out_view{inflate_out.data() + num_compressed_blocks, + num_uncompressed_blocks}; + gpu_copy_uncompressed_blocks(copy_in_view, copy_out_view, stream); } gpu::PostDecompressionReassemble(compinfo.device_ptr(), compinfo.size(), stream); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index f1d524058d2..3fe623be5b1 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1141,8 +1141,9 @@ __global__ void __launch_bounds__(1024) * * @param[in] strm_desc StripeStream device array [stripe][stream] * @param[in] chunks EncChunk device array [rowgroup][column] - * @param[out] comp_in Per-block compression input parameters - * @param[out] comp_out Per-block compression status + * @param[out] inputs Per-block compression input buffers + * @param[out] outputs Per-block compression output buffers + * @param[out] statuses Per-block compression status * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size * @param[in] max_comp_blk_size Max size of any block after compression @@ -1151,8 +1152,9 @@ __global__ void __launch_bounds__(1024) __global__ void __launch_bounds__(256) gpuInitCompressionBlocks(device_2dspan strm_desc, device_2dspan streams, // const? 
- device_span comp_in, - device_span comp_out, + device_span> inputs, + device_span> outputs, + device_span statuses, uint8_t* compressed_bfr, uint32_t comp_blk_size, uint32_t max_comp_blk_size) @@ -1175,16 +1177,11 @@ __global__ void __launch_bounds__(256) dst = compressed_bfr + ss.bfr_offset; num_blocks = (ss.stream_size > 0) ? (ss.stream_size - 1) / comp_blk_size + 1 : 1; for (uint32_t b = t; b < num_blocks; b += 256) { - gpu_inflate_input_s* blk_in = &comp_in[ss.first_block + b]; - gpu_inflate_status_s* blk_out = &comp_out[ss.first_block + b]; uint32_t blk_size = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); - blk_in->srcDevice = src + b * comp_blk_size; - blk_in->srcSize = blk_size; - blk_in->dstDevice = dst + b * (BLOCK_HEADER_SIZE + max_comp_blk_size) + BLOCK_HEADER_SIZE; - blk_in->dstSize = max_comp_blk_size; - blk_out->bytes_written = blk_size; - blk_out->status = 1; - blk_out->reserved = 0; + inputs[ss.first_block + b] = {src + b * comp_blk_size, blk_size}; + outputs[ss.first_block + b] = { + dst + b * (BLOCK_HEADER_SIZE + max_comp_blk_size) + BLOCK_HEADER_SIZE, max_comp_blk_size}; + statuses[ss.first_block + b] = {blk_size, 1, 0}; } } @@ -1194,8 +1191,9 @@ __global__ void __launch_bounds__(256) * * @param[in,out] strm_desc StripeStream device array [stripe][stream] * @param[in] chunks EncChunk device array [rowgroup][column] - * @param[in] comp_in Per-block compression input parameters - * @param[in] comp_out Per-block compression status + * @param[out] inputs Per-block compression input buffers + * @param[out] outputs Per-block compression output buffers + * @param[out] statuses Per-block compression status * @param[in] compressed_bfr Compression output buffer * @param[in] comp_blk_size Compression block size * @param[in] max_comp_blk_size Max size of any block after compression @@ -1203,8 +1201,9 @@ __global__ void __launch_bounds__(256) // blockDim {1024,1,1} __global__ void __launch_bounds__(1024) gpuCompactCompressedBlocks(device_2dspan strm_desc, - device_span comp_in, - device_span comp_out, + device_span const> inputs, + device_span const> outputs, + device_span statuses, uint8_t* compressed_bfr, uint32_t comp_blk_size, uint32_t max_comp_blk_size) @@ -1228,21 +1227,21 @@ __global__ void __launch_bounds__(1024) b = 0; do { if (t == 0) { - gpu_inflate_input_s* blk_in = &comp_in[ss.first_block + b]; - gpu_inflate_status_s* blk_out = &comp_out[ss.first_block + b]; - uint32_t src_len = + auto const src_len = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); - uint32_t dst_len = (blk_out->status == 0) ? blk_out->bytes_written : src_len; - uint32_t blk_size24; + auto dst_len = (statuses[ss.first_block + b].status == 0) + ? 
statuses[ss.first_block + b].bytes_written + : src_len; + uint32_t blk_size24{}; if (dst_len >= src_len) { // Copy from uncompressed source - src = static_cast(blk_in->srcDevice); - blk_out->bytes_written = src_len; - dst_len = src_len; - blk_size24 = dst_len * 2 + 1; + src = inputs[ss.first_block + b].data(); + statuses[ss.first_block + b].bytes_written = src_len; + dst_len = src_len; + blk_size24 = dst_len * 2 + 1; } else { // Compressed block - src = static_cast(blk_in->dstDevice); + src = outputs[ss.first_block + b].data(); blk_size24 = dst_len * 2 + 0; } dst[0] = static_cast(blk_size24 >> 0); @@ -1311,14 +1310,21 @@ void CompressOrcDataStreams(uint8_t* compressed_data, uint32_t max_comp_blk_size, device_2dspan strm_desc, device_2dspan enc_streams, - device_span comp_in, - device_span comp_out, + device_span> comp_in, + device_span> comp_out, + device_span comp_stat, rmm::cuda_stream_view stream) { dim3 dim_block_init(256, 1); dim3 dim_grid(strm_desc.size().first, strm_desc.size().second); - gpuInitCompressionBlocks<<>>( - strm_desc, enc_streams, comp_in, comp_out, compressed_data, comp_blk_size, max_comp_blk_size); + gpuInitCompressionBlocks<<>>(strm_desc, + enc_streams, + comp_in, + comp_out, + comp_stat, + compressed_data, + comp_blk_size, + max_comp_blk_size); if (compression == SNAPPY) { if (detail::nvcomp_integration::is_stable_enabled()) { try { @@ -1336,15 +1342,18 @@ void CompressOrcDataStreams(uint8_t* compressed_data, rmm::device_uvector compressed_bytes_written(num_compressed_blocks, stream); auto comp_it = thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), - uncompressed_data_sizes.begin(), - compressed_data_ptrs.begin()); + uncompressed_data_sizes.begin()); + thrust::transform( + rmm::exec_policy(stream), + comp_in.begin(), + comp_in.end(), + comp_it, + [] __device__(auto const& in) { return thrust::make_tuple(in.data(), in.size()); }); thrust::transform(rmm::exec_policy(stream), - comp_in.begin(), - comp_in.end(), - comp_it, - [] __device__(gpu_inflate_input_s in) { - return thrust::make_tuple(in.srcDevice, in.srcSize, in.dstDevice); - }); + comp_out.begin(), + comp_out.end(), + compressed_data_ptrs.begin(), + [] __device__(auto const& out) { return out.data(); }); nvcomp_status = nvcompBatchedSnappyCompressAsync(uncompressed_data_ptrs.data(), uncompressed_data_sizes.data(), max_comp_blk_size, @@ -1361,9 +1370,9 @@ void CompressOrcDataStreams(uint8_t* compressed_data, thrust::transform(rmm::exec_policy(stream), compressed_bytes_written.begin(), compressed_bytes_written.end(), - comp_out.begin(), + comp_stat.begin(), [] __device__(size_t size) { - gpu_inflate_status_s status{}; + decompress_status status{}; status.bytes_written = size; return status; }); @@ -1371,18 +1380,18 @@ void CompressOrcDataStreams(uint8_t* compressed_data, // If we reach this then there was an error in compressing so set an error status for each // block thrust::for_each(rmm::exec_policy(stream), - comp_out.begin(), - comp_out.end(), - [] __device__(gpu_inflate_status_s & stat) { stat.status = 1; }); + comp_stat.begin(), + comp_stat.end(), + [] __device__(decompress_status & stat) { stat.status = 1; }); }; } else { - gpu_snap(comp_in.data(), comp_out.data(), num_compressed_blocks, stream); + gpu_snap(comp_in, comp_out, comp_stat, stream); } } dim3 dim_block_compact(1024, 1); gpuCompactCompressedBlocks<<>>( - strm_desc, comp_in, comp_out, compressed_data, comp_blk_size, max_comp_blk_size); + strm_desc, comp_in, comp_out, comp_stat, compressed_data, comp_blk_size, max_comp_blk_size); } } // 
namespace gpu diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 276a1f49abf..e44ca10922f 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -26,9 +26,16 @@ namespace cudf { namespace io { namespace orc { namespace gpu { + +struct comp_in_out { + uint8_t const* in_ptr; + size_t in_size; + uint8_t* out_ptr; + size_t out_size; +}; struct compressed_stream_s { CompressedStreamInfo info; - gpu_inflate_input_s ctl; + comp_in_out ctl; }; // blockDim {128,1,1} @@ -57,7 +64,8 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat uint32_t block_len = shuffle((lane_id == 0) ? cur[0] | (cur[1] << 8) | (cur[2] << 16) : 0); uint32_t is_uncompressed = block_len & 1; uint32_t uncompressed_size; - gpu_inflate_input_s* init_ctl = nullptr; + device_span* init_in_ctl = nullptr; + device_span* init_out_ctl = nullptr; block_len >>= 1; cur += BLOCK_HEADER_SIZE; if (block_len > block_size || cur + block_len > end) { @@ -82,27 +90,34 @@ extern "C" __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeDat uncompressed[max_uncompressed_size + lane_id] = cur[lane_id]; } } else { - init_ctl = s->info.copyctl; - init_ctl = (init_ctl && num_uncompressed_blocks < s->info.num_uncompressed_blocks) - ? &init_ctl[num_uncompressed_blocks] - : nullptr; + init_in_ctl = + (s->info.copy_in_ctl && num_uncompressed_blocks < s->info.num_uncompressed_blocks) + ? &s->info.copy_in_ctl[num_uncompressed_blocks] + : nullptr; + init_out_ctl = + (s->info.copy_out_ctl && num_uncompressed_blocks < s->info.num_uncompressed_blocks) + ? &s->info.copy_out_ctl[num_uncompressed_blocks] + : nullptr; num_uncompressed_blocks++; } } else { - init_ctl = s->info.decctl; - init_ctl = (init_ctl && num_compressed_blocks < s->info.num_compressed_blocks) - ? &init_ctl[num_compressed_blocks] - : nullptr; + init_in_ctl = (s->info.dec_in_ctl && num_compressed_blocks < s->info.num_compressed_blocks) + ? &s->info.dec_in_ctl[num_compressed_blocks] + : nullptr; + init_out_ctl = + (s->info.dec_out_ctl && num_compressed_blocks < s->info.num_compressed_blocks) + ? 
&s->info.dec_out_ctl[num_compressed_blocks] + : nullptr; num_compressed_blocks++; } - if (!lane_id && init_ctl) { - s->ctl.srcDevice = const_cast(cur); - s->ctl.srcSize = block_len; - s->ctl.dstDevice = uncompressed + max_uncompressed_size; - s->ctl.dstSize = uncompressed_size; + if (!lane_id && init_in_ctl) { + s->ctl = {cur, block_len, uncompressed + max_uncompressed_size, uncompressed_size}; } __syncwarp(); - if (init_ctl && lane_id == 0) *init_ctl = s->ctl; + if (init_in_ctl && lane_id == 0) { + *init_in_ctl = {s->ctl.in_ptr, s->ctl.in_size}; + *init_out_ctl = {s->ctl.out_ptr, s->ctl.out_size}; + } cur += block_len; max_uncompressed_size += uncompressed_size; max_uncompressed_block_size = max(max_uncompressed_block_size, uncompressed_size); @@ -137,14 +152,14 @@ extern "C" __global__ void __launch_bounds__(128, 8) s->info.num_compressed_blocks + s->info.num_uncompressed_blocks > 0 && s->info.max_uncompressed_size > 0) { // Walk through the compressed blocks - const uint8_t* cur = s->info.compressed_data; - const uint8_t* end = cur + s->info.compressed_data_size; - const gpu_inflate_input_s* dec_in = s->info.decctl; - const gpu_inflate_status_s* dec_out = s->info.decstatus; - uint8_t* uncompressed_actual = s->info.uncompressed_data; - uint8_t* uncompressed_estimated = uncompressed_actual; - uint32_t num_compressed_blocks = 0; - uint32_t max_compressed_blocks = s->info.num_compressed_blocks; + const uint8_t* cur = s->info.compressed_data; + const uint8_t* end = cur + s->info.compressed_data_size; + auto dec_out = s->info.dec_out_ctl; + auto dec_status = s->info.decstatus; + uint8_t* uncompressed_actual = s->info.uncompressed_data; + uint8_t* uncompressed_estimated = uncompressed_actual; + uint32_t num_compressed_blocks = 0; + uint32_t max_compressed_blocks = s->info.num_compressed_blocks; while (cur + BLOCK_HEADER_SIZE < end) { uint32_t block_len = shuffle((lane_id == 0) ? cur[0] | (cur[1] << 8) | (cur[2] << 16) : 0); @@ -158,14 +173,14 @@ extern "C" __global__ void __launch_bounds__(128, 8) uncompressed_size_actual = block_len; } else { if (num_compressed_blocks > max_compressed_blocks) { break; } - if (shuffle((lane_id == 0) ? dec_out[num_compressed_blocks].status : 0) != 0) { + if (shuffle((lane_id == 0) ? dec_status[num_compressed_blocks].status : 0) != 0) { // Decompression failed, not much point in doing anything else break; } - uncompressed_size_est = - shuffle((lane_id == 0) ? *(const uint32_t*)&dec_in[num_compressed_blocks].dstSize : 0); - uncompressed_size_actual = shuffle( - (lane_id == 0) ? *(const uint32_t*)&dec_out[num_compressed_blocks].bytes_written : 0); + uint32_t const dst_size = dec_out[num_compressed_blocks].size(); + uncompressed_size_est = shuffle((lane_id == 0) ? dst_size : 0); + uint32_t const bytes_written = dec_status[num_compressed_blocks].bytes_written; + uncompressed_size_actual = shuffle((lane_id == 0) ? bytes_written : 0); } // In practice, this should never happen with a well-behaved writer, as we would expect the // uncompressed size to always be equal to the compression block size except for the last @@ -360,11 +375,11 @@ static __device__ void gpuMapRowIndexToUncompressed(rowindex_state_s* s, if (strm_len > 0) { int32_t compressed_offset = (t < num_rowgroups) ? 
s->compressed_offset[t][ci_id] : 0; if (compressed_offset > 0) { - const uint8_t* start = s->strm_info[ci_id].compressed_data; - const uint8_t* cur = start; - const uint8_t* end = cur + s->strm_info[ci_id].compressed_data_size; - gpu_inflate_status_s* decstatus = s->strm_info[ci_id].decstatus; - uint32_t uncomp_offset = 0; + const uint8_t* start = s->strm_info[ci_id].compressed_data; + const uint8_t* cur = start; + const uint8_t* end = cur + s->strm_info[ci_id].compressed_data_size; + auto decstatus = s->strm_info[ci_id].decstatus.data(); + uint32_t uncomp_offset = 0; for (;;) { uint32_t block_len, is_uncompressed; diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 779d0390751..ecd2d6f6ec0 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1314,7 +1314,7 @@ void writer::impl::write_index_stream(int32_t stripe_id, file_segmentation const& segmentation, host_2dspan enc_streams, host_2dspan strm_desc, - host_span comp_out, + host_span comp_out, std::vector const& rg_stats, StripeInformation* stripe, orc_streams* streams, @@ -2050,8 +2050,9 @@ void writer::impl::write(table_view const& table) // Compress the data streams rmm::device_buffer compressed_data(compressed_bfr_size, stream); - hostdevice_vector comp_out(num_compressed_blocks, stream); - hostdevice_vector comp_in(num_compressed_blocks, stream); + hostdevice_vector> comp_in(num_compressed_blocks, stream); + hostdevice_vector> comp_out(num_compressed_blocks, stream); + hostdevice_vector comp_stats(num_compressed_blocks, stream); if (compression_kind_ != NONE) { strm_descs.host_to_device(stream); gpu::CompressOrcDataStreams(static_cast(compressed_data.data()), @@ -2063,9 +2064,10 @@ void writer::impl::write(table_view const& table) enc_data.streams, comp_in, comp_out, + comp_stats, stream); strm_descs.device_to_host(stream); - comp_out.device_to_host(stream, true); + comp_stats.device_to_host(stream, true); } ProtobufWriter pbw_(&buffer_); @@ -2097,7 +2099,7 @@ void writer::impl::write(table_view const& table) segmentation, enc_data.streams, strm_descs, - comp_out, + comp_stats, intermediate_stats.rowgroup_blobs, &stripe, &streams, diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 5f981793762..d823c73007f 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -386,7 +386,7 @@ class writer::impl { file_segmentation const& segmentation, host_2dspan enc_streams, host_2dspan strm_desc, - host_span comp_out, + host_span comp_out, std::vector const& rg_stats, StripeInformation* stripe, orc_streams* streams, diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 61bd29399cd..f05f0af2a79 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -81,8 +81,6 @@ struct page_enc_state_s { EncPage page; EncColumnChunk ck; parquet_column_device_view col; - gpu_inflate_input_s comp_in; - gpu_inflate_status_s comp_stat; uint16_t vals[rle_buffer_size]; }; @@ -750,8 +748,9 @@ static __device__ std::pair convert_nanoseconds(timesta template __global__ void __launch_bounds__(128, 8) gpuEncodePages(device_span pages, - device_span comp_in, - device_span comp_stat) + device_span> comp_in, + device_span> comp_out, + device_span comp_stats) { __shared__ __align__(8) page_enc_state_s state_g; using block_scan = cub::BlockScan; @@ -761,6 +760,7 @@ __global__ void __launch_bounds__(128, 8) uint32_t t = threadIdx.x; if (t == 0) { + state_g = page_enc_state_s{}; s->page = pages[blockIdx.x]; 
s->ck = *s->page.chunk; s->col = *s->ck.col_desc; @@ -1085,21 +1085,14 @@ __global__ void __launch_bounds__(128, 8) auto actual_data_size = static_cast(s->cur - base); uint32_t compressed_bfr_size = GetMaxCompressedBfrSize(actual_data_size); s->page.max_data_size = actual_data_size; - s->comp_in.srcDevice = base; - s->comp_in.srcSize = actual_data_size; - s->comp_in.dstDevice = s->page.compressed_data + s->page.max_hdr_size; - s->comp_in.dstSize = compressed_bfr_size; - s->comp_stat.bytes_written = 0; - s->comp_stat.status = ~0; - s->comp_stat.reserved = 0; - } - __syncthreads(); - if (t == 0) { + if (not comp_in.empty()) { + comp_in[blockIdx.x] = {base, actual_data_size}; + comp_out[blockIdx.x] = {s->page.compressed_data + s->page.max_hdr_size, compressed_bfr_size}; + } pages[blockIdx.x] = s->page; - if (not comp_in.empty()) comp_in[blockIdx.x] = s->comp_in; - if (not comp_stat.empty()) { - comp_stat[blockIdx.x] = s->comp_stat; - pages[blockIdx.x].comp_stat = &comp_stat[blockIdx.x]; + if (not comp_stats.empty()) { + comp_stats[blockIdx.x] = {0, ~0u}; + pages[blockIdx.x].comp_stat = &comp_stats[blockIdx.x]; } } } @@ -1317,7 +1310,7 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start, // blockDim(128, 1, 1) __global__ void __launch_bounds__(128) gpuEncodePageHeaders(device_span pages, - device_span comp_stat, + device_span comp_stat, device_span page_stats, const statistics_chunk* chunk_stats) { @@ -1946,14 +1939,15 @@ void InitEncoderPages(device_2dspan chunks, } void EncodePages(device_span pages, - device_span comp_in, - device_span comp_stat, + device_span> comp_in, + device_span> comp_out, + device_span comp_stats, rmm::cuda_stream_view stream) { auto num_pages = pages.size(); // A page is part of one column. This is launching 1 block per page. 1 block will exclusively // deal with one datatype. - gpuEncodePages<128><<>>(pages, comp_in, comp_stat); + gpuEncodePages<128><<>>(pages, comp_in, comp_out, comp_stats); } void DecideCompression(device_span chunks, rmm::cuda_stream_view stream) @@ -1962,7 +1956,7 @@ void DecideCompression(device_span chunks, rmm::cuda_stream_view } void EncodePageHeaders(device_span pages, - device_span comp_stat, + device_span comp_stats, device_span page_stats, const statistics_chunk* chunk_stats, rmm::cuda_stream_view stream) @@ -1970,7 +1964,7 @@ void EncodePageHeaders(device_span pages, // TODO: single thread task. No need for 128 threads/block. Earlier it used to employ rest of the // threads to coop load structs gpuEncodePageHeaders<<>>( - pages, comp_stat, page_stats, chunk_stats); + pages, comp_stats, page_stats, chunk_stats); } void GatherPages(device_span chunks, diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 53b82c73a35..057b9a87214 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -378,7 +378,7 @@ struct EncPage { uint32_t num_leaf_values; //!< Values in page. Different from num_rows in case of nested types uint32_t num_values; //!< Number of def/rep level values in page. 
Includes null/empty elements in //!< non-leaf levels - gpu_inflate_status_s* comp_stat; //!< Ptr to compression status + decompress_status* comp_stat; //!< Ptr to compression status }; /** @@ -584,13 +584,15 @@ void InitEncoderPages(cudf::detail::device_2dspan chunks, * @brief Launches kernel for packing column data into parquet pages * * @param[in,out] pages Device array of EncPages (unordered) - * @param[out] comp_in Optionally initializes compressor input params - * @param[out] comp_out Optionally initializes compressor output params + * @param[out] comp_in Compressor input buffers + * @param[out] comp_out Compressor output buffers + * @param[out] comp_stats Compressor statuses * @param[in] stream CUDA stream to use, default 0 */ void EncodePages(device_span pages, - device_span comp_in, - device_span comp_out, + device_span> comp_in, + device_span> comp_out, + device_span comp_stats, rmm::cuda_stream_view stream); /** @@ -605,13 +607,13 @@ void DecideCompression(device_span chunks, rmm::cuda_stream_view * @brief Launches kernel to encode page headers * * @param[in,out] pages Device array of EncPages - * @param[in] comp_out Compressor status or nullptr if no compression + * @param[in] comp_stats Compressor status * @param[in] page_stats Optional page-level statistics to be included in page header * @param[in] chunk_stats Optional chunk-level statistics to be encoded * @param[in] stream CUDA stream to use, default 0 */ void EncodePageHeaders(device_span pages, - device_span comp_out, + device_span comp_stats, device_span page_stats, const statistics_chunk* chunk_stats, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index cfca0bad518..a40993ee2dd 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -24,6 +24,7 @@ #include "compact_protocol_reader.hpp" #include +#include #include #include @@ -38,10 +39,9 @@ #include #include -#include - #include #include +#include #include #include @@ -1050,96 +1050,13 @@ void reader::impl::decode_page_headers(hostdevice_vector& pages.device_to_host(stream, true); } -__global__ void decompress_check_kernel(device_span stats, - bool* any_block_failure) -{ - auto tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < stats.size()) { - if (stats[tid].status != 0) { - *any_block_failure = true; // Doesn't need to be atomic - } - } -} - -void decompress_check(device_span stats, - bool* any_block_failure, - rmm::cuda_stream_view stream) - { - if (stats.empty()) { return; } // early exit for empty stats - - dim3 block(128); - dim3 grid(cudf::util::div_rounding_up_safe(stats.size(), static_cast(block.x))); - decompress_check_kernel<<>>(stats, any_block_failure); -} - -__global__ void convert_nvcomp_status(device_span nvcomp_stats, - device_span stats) -{ - auto tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < stats.size()) { - stats[tid].status = nvcomp_stats[tid] == nvcompStatus_t::nvcompSuccess ? 
0 : 1; - } -} - -void snappy_decompress(device_span comp_in, - device_span comp_stat, - size_t max_uncomp_page_size, - rmm::cuda_stream_view stream) +void decompress_check(device_span stats, rmm::cuda_stream_view stream) { - size_t num_comp_pages = comp_in.size(); - size_t temp_size; - - nvcompStatus_t nvcomp_status = - nvcompBatchedSnappyDecompressGetTempSize(num_comp_pages, max_uncomp_page_size, &temp_size); - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, - "Unable to get scratch size for snappy decompression"); - - // Not needed now but nvcomp API makes no promises about future - rmm::device_buffer scratch(temp_size, stream); - // Analogous to comp_in.srcDevice - rmm::device_uvector compressed_data_ptrs(num_comp_pages, stream); - // Analogous to comp_in.srcSize - rmm::device_uvector compressed_data_sizes(num_comp_pages, stream); - // Analogous to comp_in.dstDevice - rmm::device_uvector uncompressed_data_ptrs(num_comp_pages, stream); - // Analogous to comp_in.dstSize - rmm::device_uvector uncompressed_data_sizes(num_comp_pages, stream); - - // Analogous to comp_stat.bytes_written - rmm::device_uvector actual_uncompressed_data_sizes(num_comp_pages, stream); - // Convertible to comp_stat.status - rmm::device_uvector statuses(num_comp_pages, stream); - device_span statuses_span(statuses.data(), statuses.size()); - - // Prepare the vectors - auto comp_it = thrust::make_zip_iterator(compressed_data_ptrs.begin(), - compressed_data_sizes.begin(), - uncompressed_data_ptrs.begin(), - uncompressed_data_sizes.data()); - thrust::transform(rmm::exec_policy(stream), - comp_in.begin(), - comp_in.end(), - comp_it, - [] __device__(gpu_inflate_input_s in) { - return thrust::make_tuple(in.srcDevice, in.srcSize, in.dstDevice, in.dstSize); - }); - - nvcomp_status = nvcompBatchedSnappyDecompressAsync(compressed_data_ptrs.data(), - compressed_data_sizes.data(), - uncompressed_data_sizes.data(), - actual_uncompressed_data_sizes.data(), - num_comp_pages, - scratch.data(), - scratch.size(), - uncompressed_data_ptrs.data(), - statuses.data(), - stream.value()); - CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess, - "unable to perform snappy decompression"); - - dim3 block(128); - dim3 grid(cudf::util::div_rounding_up_safe(num_comp_pages, static_cast(block.x))); - convert_nvcomp_status<<>>(statuses_span, comp_stat); + CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), + stats.begin(), + stats.end(), + [] __device__(auto const& stat) { return stat.status == 0; }), + "Error during decompression"); } /** @@ -1175,9 +1092,9 @@ rmm::device_buffer reader::impl::decompress_page_data( int32_t max_decompressed_size; }; - std::array codecs{codec_stats{parquet::GZIP, 0, 0}, - codec_stats{parquet::SNAPPY, 0, 0}, - codec_stats{parquet::BROTLI, 0, 0}}; + std::array codecs{codec_stats{parquet::GZIP, 0, 0}, + codec_stats{parquet::SNAPPY, 0, 0}, + codec_stats{parquet::BROTLI, 0, 0}}; auto is_codec_supported = [&codecs](int8_t codec) { if (codec == parquet::UNCOMPRESSED) return true; @@ -1207,91 +1124,73 @@ rmm::device_buffer reader::impl::decompress_page_data( // Dispatch batches of pages to decompress for each codec rmm::device_buffer decomp_pages(total_decomp_size, stream); - hostdevice_vector inflate_in(0, num_comp_pages, stream); - hostdevice_vector inflate_out(0, num_comp_pages, stream); - hostdevice_vector any_block_failure(1, stream); - any_block_failure[0] = false; - any_block_failure.host_to_device(stream); + std::vector> comp_in; + comp_in.reserve(num_comp_pages); + std::vector> comp_out; + 
comp_out.reserve(num_comp_pages); - device_span inflate_in_view(inflate_in.device_ptr(), inflate_in.size()); - device_span inflate_out_view(inflate_out.device_ptr(), inflate_out.size()); + rmm::device_uvector comp_stats(num_comp_pages, stream); + thrust::fill(rmm::exec_policy(stream), + comp_stats.begin(), + comp_stats.end(), + decompress_status{0, static_cast(-1000), 0}); size_t decomp_offset = 0; - int32_t argc = 0; + int32_t start_pos = 0; for (const auto& codec : codecs) { - if (codec.num_pages > 0) { - int32_t start_pos = argc; - - for_each_codec_page(codec.compression_type, [&](size_t page) { - auto dst_base = static_cast(decomp_pages.data()); - inflate_in[argc].srcDevice = pages[page].page_data; - inflate_in[argc].srcSize = pages[page].compressed_page_size; - inflate_in[argc].dstDevice = dst_base + decomp_offset; - inflate_in[argc].dstSize = pages[page].uncompressed_page_size; - - inflate_out[argc].bytes_written = 0; - inflate_out[argc].status = static_cast(-1000); - inflate_out[argc].reserved = 0; - - pages[page].page_data = static_cast(inflate_in[argc].dstDevice); - decomp_offset += inflate_in[argc].dstSize; - argc++; - }); + if (codec.num_pages == 0) { continue; } - CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_in.device_ptr(start_pos), - inflate_in.host_ptr(start_pos), - sizeof(decltype(inflate_in)::value_type) * (argc - start_pos), - cudaMemcpyHostToDevice, - stream.value())); - CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_out.device_ptr(start_pos), - inflate_out.host_ptr(start_pos), - sizeof(decltype(inflate_out)::value_type) * (argc - start_pos), - cudaMemcpyHostToDevice, - stream.value())); - - switch (codec.compression_type) { - case parquet::GZIP: - CUDF_CUDA_TRY(gpuinflate(inflate_in.device_ptr(start_pos), - inflate_out.device_ptr(start_pos), - argc - start_pos, - 1, - stream)) - break; - case parquet::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { - snappy_decompress(inflate_in_view.subspan(start_pos, argc - start_pos), - inflate_out_view.subspan(start_pos, argc - start_pos), - codec.max_decompressed_size, - stream); - } else { - CUDF_CUDA_TRY(gpu_unsnap(inflate_in.device_ptr(start_pos), - inflate_out.device_ptr(start_pos), - argc - start_pos, - stream)); - } - break; - case parquet::BROTLI: - CUDF_CUDA_TRY(gpu_debrotli(inflate_in.device_ptr(start_pos), - inflate_out.device_ptr(start_pos), - debrotli_scratch.data(), - debrotli_scratch.size(), - argc - start_pos, - stream)); - break; - default: CUDF_FAIL("Unexpected decompression dispatch"); break; - } - CUDF_CUDA_TRY(cudaMemcpyAsync(inflate_out.host_ptr(start_pos), - inflate_out.device_ptr(start_pos), - sizeof(decltype(inflate_out)::value_type) * (argc - start_pos), - cudaMemcpyDeviceToHost, - stream.value())); + for_each_codec_page(codec.compression_type, [&](size_t page) { + auto dst_base = static_cast(decomp_pages.data()); + comp_in.emplace_back(pages[page].page_data, + static_cast(pages[page].compressed_page_size)); + comp_out.emplace_back(dst_base + decomp_offset, + static_cast(pages[page].uncompressed_page_size)); + + pages[page].page_data = static_cast(comp_out.back().data()); + decomp_offset += comp_out.back().size(); + }); + + host_span const> comp_in_view{comp_in.data() + start_pos, + codec.num_pages}; + auto const d_comp_in = cudf::detail::make_device_uvector_async(comp_in_view, stream); + host_span const> comp_out_view(comp_out.data() + start_pos, + codec.num_pages); + auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, stream); + device_span d_comp_stats_view(comp_stats.data() + 
start_pos, + codec.num_pages); + + switch (codec.compression_type) { + case parquet::GZIP: + gpuinflate(d_comp_in, d_comp_out, d_comp_stats_view, gzip_header_included::YES, stream); + break; + case parquet::SNAPPY: + if (nvcomp_integration::is_stable_enabled()) { + nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, + d_comp_in, + d_comp_out, + d_comp_stats_view, + codec.max_decompressed_size, + stream); + } else { + gpu_unsnap(d_comp_in, d_comp_out, d_comp_stats_view, stream); + } + break; + case parquet::BROTLI: + gpu_debrotli(d_comp_in, + d_comp_out, + d_comp_stats_view, + debrotli_scratch.data(), + debrotli_scratch.size(), + stream); + break; + default: CUDF_FAIL("Unexpected decompression dispatch"); break; } + start_pos += codec.num_pages; } - decompress_check(inflate_out_view, any_block_failure.device_ptr(), stream); - any_block_failure.device_to_host(stream, true); // synchronizes stream - CUDF_EXPECTS(not any_block_failure[0], "Error during decompression"); + decompress_check(comp_stats, stream); // Update the page information in device memory with the updated value of // page_data; it now points to the uncompressed data buffer diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 75a50714407..dbbd39fb508 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -984,8 +984,9 @@ void writer::impl::init_encoder_pages(hostdevice_2dvector& stream.synchronize(); } -void snappy_compress(device_span comp_in, - device_span comp_stat, +void snappy_compress(device_span const> comp_in, + device_span const> comp_out, + device_span comp_stats, size_t max_page_uncomp_data_size, rmm::cuda_stream_view stream) { @@ -1012,16 +1013,20 @@ void snappy_compress(device_span comp_in, // the space allocated unless one uses the API nvcompBatchedSnappyCompressGetOutputSize() // Prepare the vectors - auto comp_it = thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), - uncompressed_data_sizes.begin(), - compressed_data_ptrs.begin()); + auto comp_it = + thrust::make_zip_iterator(uncompressed_data_ptrs.begin(), uncompressed_data_sizes.begin()); + thrust::transform( + rmm::exec_policy(stream), + comp_in.begin(), + comp_in.end(), + comp_it, + [] __device__(auto const& in) { return thrust::make_tuple(in.data(), in.size()); }); + thrust::transform(rmm::exec_policy(stream), - comp_in.begin(), - comp_in.end(), - comp_it, - [] __device__(gpu_inflate_input_s in) { - return thrust::make_tuple(in.srcDevice, in.srcSize, in.dstDevice); - }); + comp_out.begin(), + comp_out.end(), + compressed_data_ptrs.begin(), + [] __device__(auto const& out) { return out.data(); }); nvcomp_status = nvcompBatchedSnappyCompressAsync(uncompressed_data_ptrs.data(), uncompressed_data_sizes.data(), max_page_uncomp_data_size, @@ -1041,9 +1046,9 @@ void snappy_compress(device_span comp_in, thrust::transform(rmm::exec_policy(stream), compressed_bytes_written.begin(), compressed_bytes_written.end(), - comp_stat.begin(), + comp_stats.begin(), [] __device__(size_t size) { - gpu_inflate_status_s status{}; + decompress_status status{}; status.bytes_written = size; return status; }); @@ -1051,9 +1056,9 @@ void snappy_compress(device_span comp_in, } catch (...) 
{ // If we reach this then there was an error in compressing so set an error status for each page thrust::for_each(rmm::exec_policy(stream), - comp_stat.begin(), - comp_stat.end(), - [] __device__(gpu_inflate_status_s & stat) { stat.status = 1; }); + comp_stats.begin(), + comp_stats.end(), + [] __device__(decompress_status & stat) { stat.status = 1; }); }; } @@ -1077,19 +1082,17 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks uint32_t max_comp_pages = (compression_ != parquet::Compression::UNCOMPRESSED) ? pages_in_batch : 0; - rmm::device_uvector compression_input(max_comp_pages, stream); - rmm::device_uvector compression_status(max_comp_pages, stream); - - device_span comp_in{compression_input.data(), compression_input.size()}; - device_span comp_stat{compression_status.data(), compression_status.size()}; + rmm::device_uvector> comp_in(max_comp_pages, stream); + rmm::device_uvector> comp_out(max_comp_pages, stream); + rmm::device_uvector comp_stats(max_comp_pages, stream); - gpu::EncodePages(batch_pages, comp_in, comp_stat, stream); + gpu::EncodePages(batch_pages, comp_in, comp_out, comp_stats, stream); switch (compression_) { case parquet::Compression::SNAPPY: if (nvcomp_integration::is_stable_enabled()) { - snappy_compress(comp_in, comp_stat, max_page_uncomp_data_size, stream); + snappy_compress(comp_in, comp_out, comp_stats, max_page_uncomp_data_size, stream); } else { - CUDF_CUDA_TRY(gpu_snap(comp_in.data(), comp_stat.data(), pages_in_batch, stream)); + gpu_snap(comp_in, comp_out, comp_stats, stream); } break; default: break; @@ -1098,7 +1101,7 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks // chunk-level auto d_chunks_in_batch = chunks.device_view().subspan(first_rowgroup, rowgroups_in_batch); DecideCompression(d_chunks_in_batch.flat_view(), stream); - EncodePageHeaders(batch_pages, comp_stat, batch_pages_stats, chunk_stats, stream); + EncodePageHeaders(batch_pages, comp_stats, batch_pages_stats, chunk_stats, stream); GatherPages(d_chunks_in_batch.flat_view(), pages, stream); auto h_chunks_in_batch = chunks.host_view().subspan(first_rowgroup, rowgroups_in_batch); diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index a754f7cf7d3..30c7b6ec326 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -51,10 +51,10 @@ class hostdevice_vector { } explicit hostdevice_vector(size_t initial_size, size_t max_size, rmm::cuda_stream_view stream) - : num_elements(initial_size), max_elements(max_size) + : max_elements(max_size), num_elements(initial_size) { if (max_elements != 0) { - CUDF_CUDA_TRY(cudaMallocHost(&h_data, sizeof(T) * max_elements)); + CUDF_CUDA_TRY(cudaMallocHost(reinterpret_cast(&h_data), sizeof(T) * max_elements)); d_data.resize(sizeof(T) * max_elements, stream); } } @@ -62,7 +62,7 @@ class hostdevice_vector { ~hostdevice_vector() { if (max_elements != 0) { - auto const free_result = cudaFreeHost(h_data); + [[maybe_unused]] auto const free_result = cudaFreeHost(h_data); assert(free_result == cudaSuccess); } } diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp index dd00b201df9..a325cadf6a5 100644 --- a/cpp/tests/io/comp/decomp_test.cpp +++ b/cpp/tests/io/comp/decomp_test.cpp @@ -15,6 +15,7 @@ */ #include +#include #include @@ -24,6 +25,8 @@ #include +using cudf::device_span; + /** * @brief Base test fixture for decompression * @@ -32,19 +35,6 @@ */ template struct DecompressTest : public cudf::test::BaseFixture { - 
void SetUp() override - { - ASSERT_CUDA_SUCCEEDED(cudaMallocHost((void**)&inf_args, sizeof(cudf::io::gpu_inflate_input_s))); - ASSERT_CUDA_SUCCEEDED( - cudaMallocHost((void**)&inf_stat, sizeof(cudf::io::gpu_inflate_status_s))); - } - - void TearDown() override - { - ASSERT_CUDA_SUCCEEDED(cudaFreeHost(inf_stat)); - ASSERT_CUDA_SUCCEEDED(cudaFreeHost(inf_args)); - } - std::vector vector_from_string(const char* str) const { return std::vector(reinterpret_cast(str), @@ -55,49 +45,43 @@ struct DecompressTest : public cudf::test::BaseFixture { const uint8_t* compressed, size_t compressed_size) { - rmm::device_buffer src{compressed, compressed_size, rmm::cuda_stream_default}; - rmm::device_buffer dst{decompressed->size(), rmm::cuda_stream_default}; - - inf_args->srcDevice = static_cast(src.data()); - inf_args->dstDevice = static_cast(dst.data()); - inf_args->srcSize = src.size(); - inf_args->dstSize = dst.size(); - rmm::device_uvector d_inf_args(1, rmm::cuda_stream_default); - rmm::device_uvector d_inf_stat(1, rmm::cuda_stream_default); - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_inf_args.data(), - inf_args, - sizeof(cudf::io::gpu_inflate_input_s), - cudaMemcpyHostToDevice, - 0)); - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(d_inf_stat.data(), - inf_stat, - sizeof(cudf::io::gpu_inflate_status_s), - cudaMemcpyHostToDevice, - 0)); - ASSERT_CUDA_SUCCEEDED( - static_cast(this)->dispatch(d_inf_args.data(), d_inf_stat.data())); - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync(inf_stat, - d_inf_stat.data(), - sizeof(cudf::io::gpu_inflate_status_s), - cudaMemcpyDeviceToHost, - 0)); - ASSERT_CUDA_SUCCEEDED(cudaMemcpyAsync( - decompressed->data(), inf_args->dstDevice, inf_args->dstSize, cudaMemcpyDeviceToHost, 0)); - ASSERT_CUDA_SUCCEEDED(cudaStreamSynchronize(0)); + auto stream = rmm::cuda_stream_default; + rmm::device_buffer src{compressed, compressed_size, stream}; + rmm::device_uvector dst{decompressed->size(), stream}; + + hostdevice_vector> inf_in(1, stream); + inf_in[0] = {static_cast(src.data()), src.size()}; + inf_in.host_to_device(stream); + + hostdevice_vector> inf_out(1, stream); + inf_out[0] = dst; + inf_out.host_to_device(stream); + + hostdevice_vector inf_stat(1, stream); + inf_stat[0] = {}; + inf_stat.host_to_device(stream); + + static_cast(this)->dispatch(inf_in, inf_out, inf_stat); + cudaMemcpyAsync( + decompressed->data(), dst.data(), dst.size(), cudaMemcpyDeviceToHost, stream.value()); + inf_stat.device_to_host(stream, true); + ASSERT_EQ(inf_stat[0].status, 0); } - - cudf::io::gpu_inflate_input_s* inf_args = nullptr; - cudf::io::gpu_inflate_status_s* inf_stat = nullptr; }; /** * @brief Derived fixture for GZIP decompression */ struct GzipDecompressTest : public DecompressTest { - cudaError_t dispatch(cudf::io::gpu_inflate_input_s* d_inf_args, - cudf::io::gpu_inflate_status_s* d_inf_stat) + void dispatch(device_span> d_inf_in, + device_span> d_inf_out, + device_span d_inf_stat) { - return cudf::io::gpuinflate(d_inf_args, d_inf_stat, 1, 1, rmm::cuda_stream_default); + cudf::io::gpuinflate(d_inf_in, + d_inf_out, + d_inf_stat, + cudf::io::gzip_header_included::YES, + rmm::cuda_stream_default); } }; @@ -105,10 +89,11 @@ struct GzipDecompressTest : public DecompressTest { * @brief Derived fixture for Snappy decompression */ struct SnappyDecompressTest : public DecompressTest { - cudaError_t dispatch(cudf::io::gpu_inflate_input_s* d_inf_args, - cudf::io::gpu_inflate_status_s* d_inf_stat) + void dispatch(device_span> d_inf_in, + device_span> d_inf_out, + device_span d_inf_stat) { - return 
cudf::io::gpu_unsnap(d_inf_args, d_inf_stat, 1, rmm::cuda_stream_default);
+    cudf::io::gpu_unsnap(d_inf_in, d_inf_out, d_inf_stat, rmm::cuda_stream_default);
   }
 };

@@ -116,14 +101,19 @@ struct SnappyDecompressTest : public DecompressTest<SnappyDecompressTest> {
  * @brief Derived fixture for Brotli decompression
  */
 struct BrotliDecompressTest : public DecompressTest<BrotliDecompressTest> {
-  cudaError_t dispatch(cudf::io::gpu_inflate_input_s* d_inf_args,
-                       cudf::io::gpu_inflate_status_s* d_inf_stat)
+  void dispatch(device_span<device_span<uint8_t const>> d_inf_in,
+                device_span<device_span<uint8_t>> d_inf_out,
+                device_span<cudf::io::decompress_status> d_inf_stat)
   {
     rmm::device_buffer d_scratch{cudf::io::get_gpu_debrotli_scratch_size(1),
                                  rmm::cuda_stream_default};
-    return cudf::io::gpu_debrotli(
-      d_inf_args, d_inf_stat, d_scratch.data(), d_scratch.size(), 1, rmm::cuda_stream_default);
+    cudf::io::gpu_debrotli(d_inf_in,
+                           d_inf_out,
+                           d_inf_stat,
+                           d_scratch.data(),
+                           d_scratch.size(),
+                           rmm::cuda_stream_default);
   }
 };

From 84f88ceb18225850835a9912a18e4c82245d5620 Mon Sep 17 00:00:00 2001
From: MithunR
Date: Thu, 28 Apr 2022 23:45:40 -0700
Subject: [PATCH 10/28] Support purging non-empty null elements from LIST/STRING columns (#10701)

Fixes #10291.

With certain operations in `libcudf`, it is possible to produce `LIST` columns with `NULL` rows that are not also empty. For instance, consider a `STRUCT` column constructed with an explicit validity buffer and a `LIST` child column:
```c++
auto const lists   = lists_column_wrapper{ {0,1}, {2,3}, {4,5} };
auto const structs = structs_column_wrapper{ {lists}, null_at(1) };
```
Since `structs[1] == NULL`, its `LIST` member is also deemed null. However, for efficiency, the null-ness is recorded in the `LIST`'s validity buffer, without purging the unnecessary values from its child. The `LIST` column appears as follows:
```
Validity: 101
Offsets:  [0, 2, 4, 6]
Child:    [0, 1, 2, 3, 4, 5]
```
Even though Row#1 is null, its size is `4 - 2 = 2`, not `0`. (Row#1 is thus a non-empty null row.)

This commit adds a `cudf::purge_nonempty_nulls()` function that purges such rows and reduces such columns to a more space-efficient representation, i.e.:
```
Validity: 101
Offsets:  [0, 2, 2, 4]
Child:    [0, 1, 4, 5]
```
This commit also modifies `cudf::gather()` not to produce `STRING`/`LIST` columns with "dirty" rows. Further, it adds two new functions to determine whether a specified column needs such purging:
1. `cudf::may_have_nonempty_nulls()`: A fast check for the *possibility* of non-empty nulls. It only checks whether the column or its descendants have null rows at all; if there are no nulls anywhere in the hierarchy, no purging is needed.
2. `cudf::has_nonempty_nulls()`: A deeper, more expensive check that categorically confirms whether non-empty null rows exist in any column in the hierarchy (see the usage sketch below).
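For illustration only (this sketch is not part of the patch's diff, and the `purge_example()` wrapper is hypothetical), the three APIs are intended to compose so that the cheap approximate check gates the exact check, which in turn gates the comparatively expensive purge copy:
```c++
#include <cudf/copying.hpp>
#include <cudf/null_mask.hpp>
#include <cudf_test/column_wrapper.hpp>

void purge_example()
{
  // Build [[0,1], [2,3], [4,5]], then mark row 1 null *after* construction,
  // leaving its values in place (a non-empty null row).
  auto const lists =
    cudf::test::lists_column_wrapper<int32_t>{{0, 1}, {2, 3}, {4, 5}}.release();
  cudf::set_null_mask(lists->mutable_view().null_mask(), 1, 2, false);

  if (cudf::may_have_nonempty_nulls(*lists) &&  // fast; false negatives impossible
      cudf::has_nonempty_nulls(*lists)) {       // exact, but more expensive
    // Compacted copy: Offsets [0, 2, 2, 4], Child [0, 1, 4, 5].
    auto const purged = cudf::purge_nonempty_nulls(cudf::lists_column_view{lists->view()});
  }
}
```
The wrappers and `set_null_mask()` are the same utilities used in the examples above; the mask is set after construction because the wrappers themselves never produce non-empty nulls.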
Authors: - MithunR (https://github.com/mythrocks) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - https://github.com/nvdbaranec - Jordan Jacobelli (https://github.com/Ethyling) URL: https://github.com/rapidsai/cudf/pull/10701 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 1 + cpp/include/cudf/copying.hpp | 153 ++++++ cpp/include/cudf/detail/copy.cuh | 47 ++ cpp/include/cudf/detail/copy.hpp | 19 +- cpp/include/cudf/lists/detail/gather.cuh | 45 +- cpp/include/cudf/strings/detail/gather.cuh | 20 +- .../cudf/structs/structs_column_view.hpp | 7 +- cpp/src/copying/purge_nonempty_nulls.cu | 134 ++++++ cpp/src/structs/structs_column_view.cpp | 2 + cpp/tests/CMakeLists.txt | 1 + cpp/tests/column/factories_test.cpp | 2 +- .../copying/purge_nonempty_nulls_tests.cpp | 437 ++++++++++++++++++ 13 files changed, 847 insertions(+), 22 deletions(-) create mode 100644 cpp/include/cudf/detail/copy.cuh create mode 100644 cpp/src/copying/purge_nonempty_nulls.cu create mode 100644 cpp/tests/copying/purge_nonempty_nulls_tests.cpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 0806bb964cf..68008e13897 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -79,6 +79,7 @@ outputs: - test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh - test -f $PREFIX/include/cudf/detail/concatenate.hpp - test -f $PREFIX/include/cudf/detail/copy.hpp + - test -f $PREFIX/include/cudf/detail/copy.cuh - test -f $PREFIX/include/cudf/detail/datetime.hpp - test -f $PREFIX/include/cudf/detail/fill.hpp - test -f $PREFIX/include/cudf/detail/gather.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 15caaec9bec..cbe2811afe4 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -238,6 +238,7 @@ add_library( src/copying/gather.cu src/copying/get_element.cu src/copying/pack.cpp + src/copying/purge_nonempty_nulls.cu src/copying/reverse.cu src/copying/sample.cu src/copying/scatter.cu diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 2e559afef4f..8f1ad7da9b6 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -17,7 +17,10 @@ #pragma once #include +#include #include +#include +#include #include #include @@ -939,5 +942,155 @@ std::unique_ptr
sample(
  int64_t const seed = 0,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

+/**
+ * @brief Checks if a column or its descendants have non-empty null rows
+ *
+ * @note This function is exact. If it returns `true`, there exists one or more
+ * non-empty null elements.
+ *
+ * A LIST or STRING column might have non-empty rows that are marked as null.
+ * A STRUCT or LIST column might have child columns that have non-empty null rows.
+ * Other types of columns are deemed incapable of having non-empty null rows.
+ * E.g. fixed-width columns have no concept of an "empty" row.
+ *
+ * @param input The column which is (and whose descendants are) to be checked for
+ * non-empty null rows.
+ * @return true If either the column or its descendants have non-empty null rows.
+ * @return false If neither the column nor its descendants have non-empty null rows.
+ */
+bool has_nonempty_nulls(column_view const& input);
+
+/**
+ * @brief Approximates whether a column or its descendants *may* have non-empty null elements
+ *
+ * @note This function is approximate.
+ * - `true`: Non-empty null elements could exist
+ * - `false`: Non-empty null elements definitely do not exist
+ *
+ * False positives are possible, but false negatives are not.
+ *
+ * Compared to the exact `has_nonempty_nulls()` function, this function is typically
+ * more efficient.
+ *
+ * Complexity:
+ * - Best case: `O(count_descendants(input))`
+ * - Worst case: `O(count_descendants(input)) * m`, where `m` is the number of rows in the largest
+ * descendant
+ *
+ * @param input The column which is (and whose descendants are) to be checked for
+ * non-empty null rows
+ * @return true If either the column or its descendants have null rows
+ * @return false If neither the column nor its descendants have null rows
+ */
+bool may_have_nonempty_nulls(column_view const& input);
+
+/**
+ * @brief Copies `input`, purging any non-empty null rows in the column or its descendants
+ *
+ * LIST columns may have non-empty null rows.
+ * For example:
+ * @code{.pseudo}
+ *
+ * auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} }.release();
+ * cudf::detail::set_null_mask(lists->null_mask(), 1, 2, false);
+ *
+ * lists[1] is now null, but the lists child column still stores `{2,3}`.
+ * The lists column contents will be:
+ * Validity: 101
+ * Offsets: [0, 2, 4, 6]
+ * Child: [0, 1, 2, 3, 4, 5]
+ *
+ * After purging the contents of the list's null rows, the column's contents
+ * will be:
+ * Validity: 101
+ * Offsets: [0, 2, 2, 4]
+ * Child: [0, 1, 4, 5]
+ * @endcode
+ *
+ * The purge operation only applies directly to LIST and STRING columns, but it
+ * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns
+ * may have child/descendant columns that are LIST or STRING.
+ *
+ * @param input The column whose null rows are to be checked and purged
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return std::unique_ptr<column> Column with equivalent contents to `input`, but with
+ * the contents of null rows purged
+ */
+std::unique_ptr<column> purge_nonempty_nulls(
+  lists_column_view const& input,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Copies `input`, purging any non-empty null rows in the column or its descendants
+ *
+ * STRING columns may have non-empty null rows.
+ * For example:
+ * @code{.pseudo}
+ *
+ * auto const strings = strings_column_wrapper{ "AB", "CD", "EF" }.release();
+ * cudf::detail::set_null_mask(strings->null_mask(), 1, 2, false);
+ *
+ * strings[1] is now null, but the strings column still stores `"CD"`.
+ * The strings column contents will be:
+ * Validity: 101
+ * Offsets: [0, 2, 4, 6]
+ * Child: [A, B, C, D, E, F]
+ *
+ * After purging the contents of the string column's null rows, the column's
+ * contents will be:
+ * Validity: 101
+ * Offsets: [0, 2, 2, 4]
+ * Child: [A, B, E, F]
+ * @endcode
+ *
+ * The purge operation only applies directly to LIST and STRING columns, but it
+ * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns
+ * may have child/descendant columns that are LIST or STRING.
+ *
+ * @param input The column whose null rows are to be checked and purged
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return std::unique_ptr<column> Column with equivalent contents to `input`, but with
+ * the contents of null rows purged
+ */
+std::unique_ptr<column> purge_nonempty_nulls(
+  strings_column_view const& input,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Copies `input`, purging any non-empty null rows in the column or its descendants
+ *
+ * STRUCT columns may have null rows with non-empty child rows.
+ * For example:
+ * @code{.pseudo}
+ *
+ * auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} };
+ * auto const structs = structs_column_wrapper{ {lists}, null_at(1) };
+ *
+ * structs[1].child is now null, but the lists column still stores `{2,3}`.
+ * The lists column contents will be:
+ * Validity: 101
+ * Offsets: [0, 2, 4, 6]
+ * Child: [0, 1, 2, 3, 4, 5]
+ *
+ * After purging the contents of the struct's null rows, the child list
+ * column's contents will be:
+ * Validity: 101
+ * Offsets: [0, 2, 2, 4]
+ * Child: [0, 1, 4, 5]
+ * @endcode
+ *
+ * The purge operation only applies directly to LIST and STRING columns, but it
+ * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns
+ * may have child/descendant columns that are LIST or STRING.
+ *
+ * @param input The column whose null rows are to be checked and purged
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return std::unique_ptr<column> Column with equivalent contents to `input`, but with
+ * the contents of null rows purged
+ */
+std::unique_ptr<column> purge_nonempty_nulls(
+  structs_column_view const& input,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */
 } // namespace cudf
diff --git a/cpp/include/cudf/detail/copy.cuh b/cpp/include/cudf/detail/copy.cuh
new file mode 100644
index 00000000000..773bce7131f
--- /dev/null
+++ b/cpp/include/cudf/detail/copy.cuh
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+namespace cudf::detail {
+
+/**
+ * @copydoc cudf::purge_nonempty_nulls(structs_column_view const&, rmm::mr::device_memory_resource*)
+ *
+ * @tparam ColumnViewT View type (lists_column_view, strings_column_view, or structs_column_view)
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+template <typename ColumnViewT>
+std::unique_ptr<column> purge_nonempty_nulls(ColumnViewT const& input,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::mr::device_memory_resource* mr)
+{
+  // Implement via identity gather.
+  auto const input_column = input.parent();
+  auto const gather_begin = thrust::counting_iterator<size_type>(0);
+  auto const gather_end   = gather_begin + input_column.size();
+
+  auto gathered_table = cudf::detail::gather(table_view{{input_column}},
+                                             gather_begin,
+                                             gather_end,
+                                             out_of_bounds_policy::DONT_CHECK,
+                                             stream,
+                                             mr);
+  return std::move(gathered_table->release()[0]);
+}
+
+}  // namespace cudf::detail
diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp
index 50157d16876..abd14fbda89 100644
--- a/cpp/include/cudf/detail/copy.hpp
+++ b/cpp/include/cudf/detail/copy.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -299,5 +299,22 @@ std::unique_ptr<scalar> get_element(
  size_type index,
  rmm::cuda_stream_view stream = rmm::cuda_stream_default,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @copydoc cudf::has_nonempty_nulls
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+bool has_nonempty_nulls(column_view const& input,
+                        rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+
+/**
+ * @copydoc cudf::may_have_nonempty_nulls
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+bool may_have_nonempty_nulls(column_view const& input,
+                             rmm::cuda_stream_view stream = rmm::cuda_stream_default);
+
 } // namespace detail
 } // namespace cudf
diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh
index c637ad041ba..7df36be2385 100644
--- a/cpp/include/cudf/lists/detail/gather.cuh
+++ b/cpp/include/cudf/lists/detail/gather.cuh
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <cudf/utilities/bit.hpp>
 #include
 #include
@@ -82,6 +83,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column,
   auto dst_offsets_c = cudf::make_fixed_width_column(
     data_type{type_id::INT32}, offset_count, mask_state::UNALLOCATED, stream, mr);
   mutable_column_view dst_offsets_v = dst_offsets_c->mutable_view();
+  auto const source_column_nullmask = source_column.null_mask();
 
   // generate the compacted outgoing offsets.
   auto count_iter = thrust::make_counting_iterator(0);
@@ -90,12 +92,23 @@
     count_iter,
     count_iter + offset_count,
     dst_offsets_v.begin(),
-    [gather_map, output_count, src_offsets, src_size] __device__(int32_t index) -> int32_t {
+    [source_column_nullmask,
+     source_column_offset = source_column.offset(),
+     gather_map,
+     output_count,
+     src_offsets,
+     src_size] __device__(int32_t index) -> int32_t {
      int32_t offset_index = index < output_count ?
gather_map[index] : 0; // if this is an invalid index, this will be a NULL list if (NullifyOutOfBounds && ((offset_index < 0) || (offset_index >= src_size))) { return 0; } + // If the source row is null, the output row size must be 0. + if (source_column_nullmask != nullptr && + not cudf::bit_is_set(source_column_nullmask, source_column_offset + offset_index)) { + return 0; + } + // the length of this list return src_offsets[offset_index + 1] - src_offsets[offset_index]; }, @@ -110,15 +123,27 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column, // generate the base offsets rmm::device_uvector base_offsets = rmm::device_uvector(output_count, stream); - thrust::transform(rmm::exec_policy(stream), - gather_map, - gather_map + output_count, - base_offsets.data(), - [src_offsets, src_size, shift] __device__(int32_t index) { - // if this is an invalid index, this will be a NULL list - if (NullifyOutOfBounds && ((index < 0) || (index >= src_size))) { return 0; } - return src_offsets[index] - shift; - }); + thrust::transform( + rmm::exec_policy(stream), + gather_map, + gather_map + output_count, + base_offsets.data(), + [source_column_nullmask, + source_column_offset = source_column.offset(), + src_offsets, + src_size, + shift] __device__(int32_t index) { + // if this is an invalid index, this will be a NULL list + if (NullifyOutOfBounds && ((index < 0) || (index >= src_size))) { return 0; } + + // If the source row is null, the output row size must be 0. + if (source_column_nullmask != nullptr && + not cudf::bit_is_set(source_column_nullmask, source_column_offset + index)) { + return 0; + } + + return src_offsets[index] - shift; + }); // Retrieve size of the resulting gather map for level N+1 (the last offset) size_type child_gather_map_size = diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 1b10c70d6d6..d46ab3a91a1 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -303,14 +303,17 @@ std::unique_ptr gather( data_type{type_id::INT32}, output_count + 1, mask_state::UNALLOCATED, stream, mr); auto const d_out_offsets = out_offsets_column->mutable_view().template data(); auto const d_in_offsets = (strings_count > 0) ? 
strings.offsets_begin() : nullptr; - thrust::transform(rmm::exec_policy(stream), - begin, - end, - d_out_offsets, - [d_in_offsets, strings_count] __device__(size_type in_idx) { - if (NullifyOutOfBounds && (in_idx < 0 || in_idx >= strings_count)) return 0; - return d_in_offsets[in_idx + 1] - d_in_offsets[in_idx]; - }); + auto const d_strings = column_device_view::create(strings.parent(), stream); + thrust::transform( + rmm::exec_policy(stream), + begin, + end, + d_out_offsets, + [d_strings = *d_strings, d_in_offsets, strings_count] __device__(size_type in_idx) { + if (NullifyOutOfBounds && (in_idx < 0 || in_idx >= strings_count)) return 0; + if (not d_strings.is_valid(in_idx)) return 0; + return d_in_offsets[in_idx + 1] - d_in_offsets[in_idx]; + }); // check total size is not too large size_t const total_bytes = thrust::transform_reduce( @@ -329,7 +332,6 @@ std::unique_ptr gather( // build chars column cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); - auto const d_strings = column_device_view::create(strings.parent(), stream); auto out_chars_column = gather_chars(d_strings->begin(), begin, end, diff --git a/cpp/include/cudf/structs/structs_column_view.hpp b/cpp/include/cudf/structs/structs_column_view.hpp index 329c24cfe0a..ca866d8555e 100644 --- a/cpp/include/cudf/structs/structs_column_view.hpp +++ b/cpp/include/cudf/structs/structs_column_view.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,6 +41,11 @@ class structs_column_view : public column_view { explicit structs_column_view(column_view const& rhs); + /** + * @brief Returns the parent column. + */ + [[nodiscard]] column_view parent() const; + using column_view::child_begin; using column_view::child_end; using column_view::has_nulls; diff --git a/cpp/src/copying/purge_nonempty_nulls.cu b/cpp/src/copying/purge_nonempty_nulls.cu new file mode 100644 index 00000000000..778d6c4df55 --- /dev/null +++ b/cpp/src/copying/purge_nonempty_nulls.cu @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +#include + +namespace cudf { +namespace detail { + +using cudf::type_id; + +namespace { + +/// Check if nonempty-null checks can be skipped for a given type. +bool type_may_have_nonempty_nulls(cudf::type_id const& type) +{ + return type == type_id::STRING || type == type_id::LIST || type == type_id::STRUCT; +} + +/// Check if the (STRING/LIST) column has any null rows with non-zero length. +bool has_nonempty_null_rows(cudf::column_view const& input, rmm::cuda_stream_view stream) +{ + if (not input.has_nulls()) { return false; } // No nulls => no dirty rows. + + // Cross-reference nullmask and offsets. + auto const type = input.type().id(); + auto const offsets = (type == type_id::STRING) ? 
(strings_column_view{input}).offsets() + : (lists_column_view{input}).offsets(); + auto const d_input = cudf::column_device_view::create(input); + auto const is_dirty_row = [d_input = *d_input, offsets = offsets.begin()] __device__( + size_type const& row_idx) { + return d_input.is_null_nocheck(row_idx) && (offsets[row_idx] != offsets[row_idx + 1]); + }; + + auto const row_begin = thrust::counting_iterator(0); + auto const row_end = row_begin + input.size(); + return thrust::count_if(rmm::exec_policy(stream), row_begin, row_end, is_dirty_row) > 0; +} + +} // namespace + +/** + * @copydoc cudf::detail::has_nonempty_nulls + */ +bool has_nonempty_nulls(cudf::column_view const& input, rmm::cuda_stream_view stream) +{ + auto const type = input.type().id(); + + if (not type_may_have_nonempty_nulls(type)) { return false; } + + // For types with variable-length rows, check if any rows are "dirty". + // A dirty row is a null row with non-zero length. + if ((type == type_id::STRING || type == type_id::LIST) && has_nonempty_null_rows(input, stream)) { + return true; + } + + // For complex types, check if child columns need purging. + if ((type == type_id::STRUCT || type == type_id::LIST) && + std::any_of(input.child_begin(), input.child_end(), [stream](auto const& child) { + return cudf::detail::has_nonempty_nulls(child, stream); + })) { + return true; + } + + return false; +} +} // namespace detail + +/** + * @copydoc cudf::may_have_nonempty_nulls + */ +bool may_have_nonempty_nulls(column_view const& input) +{ + auto const type = input.type().id(); + + if (not detail::type_may_have_nonempty_nulls(type)) { return false; } + + if ((type == type_id::STRING || type == type_id::LIST) && input.has_nulls()) { return true; } + + if ((type == type_id::STRUCT || type == type_id::LIST) && + std::any_of(input.child_begin(), input.child_end(), may_have_nonempty_nulls)) { + return true; + } + + return false; +} + +/** + * @copydoc cudf::has_nonempty_nulls + */ +bool has_nonempty_nulls(column_view const& input) { return detail::has_nonempty_nulls(input); } + +/** + * @copydoc cudf::purge_nonempty_nulls(lists_column_view const&, rmm::mr::device_memory_resource*) + */ +std::unique_ptr purge_nonempty_nulls(lists_column_view const& input, + rmm::mr::device_memory_resource* mr) +{ + return detail::purge_nonempty_nulls(input, rmm::cuda_stream_default, mr); +} + +/** + * @copydoc cudf::purge_nonempty_nulls(structs_column_view const&, rmm::mr::device_memory_resource*) + */ +std::unique_ptr purge_nonempty_nulls(structs_column_view const& input, + rmm::mr::device_memory_resource* mr) +{ + return detail::purge_nonempty_nulls(input, rmm::cuda_stream_default, mr); +} + +/** + * @copydoc cudf::purge_nonempty_nulls(strings_column_view const&, rmm::mr::device_memory_resource*) + */ +std::unique_ptr purge_nonempty_nulls(strings_column_view const& input, + rmm::mr::device_memory_resource* mr) +{ + return detail::purge_nonempty_nulls(input, rmm::cuda_stream_default, mr); +} + +} // namespace cudf diff --git a/cpp/src/structs/structs_column_view.cpp b/cpp/src/structs/structs_column_view.cpp index 681f13386ff..7d8c8837d2d 100644 --- a/cpp/src/structs/structs_column_view.cpp +++ b/cpp/src/structs/structs_column_view.cpp @@ -25,6 +25,8 @@ structs_column_view::structs_column_view(column_view const& rhs) : column_view{r CUDF_EXPECTS(type().id() == type_id::STRUCT, "structs_column_view only supports struct columns"); } +column_view structs_column_view::parent() const { return *this; } + column_view structs_column_view::get_sliced_child(int 
index) const { std::vector children; diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e016f47616b..95c54d7596e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -250,6 +250,7 @@ ConfigureTest( copying/gather_tests.cpp copying/get_value_tests.cpp copying/pack_tests.cpp + copying/purge_nonempty_nulls_tests.cpp copying/sample_tests.cpp copying/scatter_tests.cpp copying/scatter_list_tests.cpp diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp index 4e0e70bf15c..44a79e63cd8 100644 --- a/cpp/tests/column/factories_test.cpp +++ b/cpp/tests/column/factories_test.cpp @@ -645,7 +645,7 @@ TYPED_TEST(ListsStructsLeafTest, FromNonNested) 0, cudf::create_null_mask(2, cudf::mask_state::UNALLOCATED)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*col, *expected); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*col, *expected); } TYPED_TEST(ListsStructsLeafTest, FromNested) diff --git a/cpp/tests/copying/purge_nonempty_nulls_tests.cpp b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp new file mode 100644 index 00000000000..77fd3f66ee5 --- /dev/null +++ b/cpp/tests/copying/purge_nonempty_nulls_tests.cpp @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cudf::test { + +using iterators::no_nulls; +using iterators::null_at; +using iterators::nulls_at; +using T = int32_t; // The actual type of the leaf node isn't really important. +using values_col_t = fixed_width_column_wrapper; +using offsets_col_t = fixed_width_column_wrapper; +using gather_map_t = fixed_width_column_wrapper; + +template +using LCW = cudf::test::lists_column_wrapper; + +struct PurgeNonEmptyNullsTest : public cudf::test::BaseFixture { + /// Helper to run gather() on a single column, and extract the single column from the result. + std::unique_ptr gather(column_view const& input, gather_map_t const& gather_map) + { + auto gathered = + cudf::gather(cudf::table_view{{input}}, gather_map, out_of_bounds_policy::NULLIFY); + return std::move(gathered->release()[0]); + } + + /// Verify that the result of `sanitize()` is equivalent to the unsanitized input, + /// except that the null rows are also empty. + template + void test_purge(ColumnViewT const& unpurged) + { + auto const purged = cudf::purge_nonempty_nulls(unpurged); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(unpurged.parent(), *purged); + EXPECT_FALSE(cudf::has_nonempty_nulls(*purged)); + } +}; + +// List. +TEST_F(PurgeNonEmptyNullsTest, SingleLevelList) +{ + auto const input = LCW{{{{1, 2, 3, 4}, null_at(2)}, + {5}, + {6, 7}, // <--- Will be set to NULL. Unsanitized row. + {8, 9, 10}}, + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. 
+ cudf::detail::set_null_mask(input->mutable_view().null_mask(), 2, 3, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + // Selecting all rows from input, in different order. + auto const results = gather(input->view(), {1, 2, 0, 3}); + auto const results_list_view = lists_column_view(*results); + + auto const expected = LCW{{{5}, + {}, // NULL. + {{1, 2, 3, 4}, null_at(2)}, + {8, 9, 10}}, + null_at(1)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.offsets(), offsets_col_t{0, 1, 1, 5, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.child(), + values_col_t{{5, 1, 2, 3, 4, 8, 9, 10}, null_at(3)}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Test when gather selects rows preceded by unsanitized rows. + auto const results = gather(input->view(), {3, 100, 0}); + auto const expected = LCW{{ + {8, 9, 10}, + {}, // NULL. + {{1, 2, 3, 4}, null_at(2)}, + }, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Test when gather selects rows followed by unsanitized rows. + auto const results = gather(input->view(), {1, 100, 0}); + auto const expected = LCW{{ + {5}, + {}, // NULL. + {{1, 2, 3, 4}, null_at(2)}, + }, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Test when gather selects unsanitized row specifically. + auto const results = gather(input->view(), {2}); + auto const results_lists_view = lists_column_view(*results); + auto const expected = LCW{{ + LCW{} // NULL. + }, + null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.child(), values_col_t{}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List>. +TEST_F(PurgeNonEmptyNullsTest, TwoLevelList) +{ + auto const input = + LCW{ + {{{1, 2, 3}, {4, 5, 6, 7}, {8}, {9, 1}, {2}}, + {{11, 12}, {13, 14, 15}, {16, 17, 18}, {19}}, + {{21}, {22, 23}, {24, 25, 26}}, + {{31, 32}, {33, 34, 35, 36}, {}, {37, 38}}, //<--- Will be set to NULL. Unsanitized row. + {{41}, {42, 43}}}, + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 3, 4, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + // Verify that gather() output is sanitized. + auto const results = gather(input->view(), {100, 3, 0, 1}); + auto const results_lists_view = lists_column_view(*results); + + auto const expected = LCW{{ + LCW{}, // NULL, because of out of bounds. + LCW{}, // NULL, because input row was null. + {{1, 2, 3}, {4, 5, 6, 7}, {8}, {9, 1}, {2}}, // i.e. input[0] + {{11, 12}, {13, 14, 15}, {16, 17, 18}, {19}} // i.e. 
input[1] + }, + nulls_at({0, 1})}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 0, 0, 5, 9}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + results_lists_view.child(), + LCW{ + {1, 2, 3}, {4, 5, 6, 7}, {8}, {9, 1}, {2}, {11, 12}, {13, 14, 15}, {16, 17, 18}, {19}}); + + auto const child_lists_view = lists_column_view(results_lists_view.child()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(child_lists_view.offsets(), + offsets_col_t{0, 3, 7, 8, 10, 11, 13, 16, 19, 20}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + child_lists_view.child(), + values_col_t{1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 11, 12, 13, 14, 15, 16, 17, 18, 19}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List>>. +TEST_F(PurgeNonEmptyNullsTest, ThreeLevelList) +{ + auto const input = LCW{{{{{1, 2}, {3}}, {{4, 5}, {6, 7}}, {{8, 8}, {}}, {{9, 1}}, {{2, 3}}}, + {{{11, 12}}, {{13}, {14, 15}}, {{16, 17, 18}}, {{19, 19}, {}}}, + {{{21, 21}}, {{22, 23}, {}}, {{24, 25}, {26}}}, + {{{31, 32}, {}}, + {{33, 34, 35}, {36}}, + {}, + {{37, 38}}}, //<--- Will be set to NULL. Unsanitized row. + {{{41, 41, 41}}, {{42, 43}}}}, + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 3, 4, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + auto const results = gather(input->view(), {100, 3, 0, 1}); + auto const results_lists_view = lists_column_view(*results); + + auto const expected = LCW{ + { + LCW{}, // NULL, because of out of bounds. + LCW{}, // NULL, because input row was null. + {{{1, 2}, {3}}, {{4, 5}, {6, 7}}, {{8, 8}, {}}, {{9, 1}}, {{2, 3}}}, // i.e. input[0] + {{{11, 12}}, {{13}, {14, 15}}, {{16, 17, 18}}, {{19, 19}, {}}} // i.e. input[1] + }, + nulls_at({0, 1})}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 0, 0, 5, 9}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.child(), + LCW{{{1, 2}, {3}}, + {{4, 5}, {6, 7}}, + {{8, 8}, {}}, + {{9, 1}}, + {{2, 3}}, + {{11, 12}}, + {{13}, {14, 15}}, + {{16, 17, 18}}, + {{19, 19}, {}}}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List. +TEST_F(PurgeNonEmptyNullsTest, ListOfStrings) +{ + using T = string_view; + + auto const input = LCW{{{{"1", "22", "", "4444"}, null_at(2)}, + {"55555"}, + {"666666", "7777777"}, // <--- Will be set to NULL. Unsanitized row. + {"88888888", "999999999", "1010101010"}, + {"11", "22", "33", "44"}, + {"55", "66", "77", "88"}}, + no_nulls()} + .release(); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*input)); + + // Set nullmask, post construction. + cudf::detail::set_null_mask(input->mutable_view().null_mask(), 2, 3, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*input)); + + test_purge(lists_column_view{*input}); + + { + // Selecting all rows from input, in different order. + auto const results = gather(input->view(), {1, 2, 0, 3}); + auto const results_list_view = lists_column_view(*results); + + auto const expected = LCW{{{"55555"}, + {}, // NULL. 
+ {{"1", "22", "", "4444"}, null_at(2)}, + {"88888888", "999999999", "1010101010"}}, + null_at(1)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.offsets(), offsets_col_t{0, 1, 1, 5, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + results_list_view.child(), + strings_column_wrapper{ + {"55555", "1", "22", "", "4444", "88888888", "999999999", "1010101010"}, null_at(3)}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } + { + // Gathering from a sliced column. + auto const sliced = cudf::slice({input->view()}, {1, 5})[0]; // Lop off 1 row at each end. + EXPECT_TRUE(cudf::may_have_nonempty_nulls(sliced)); + EXPECT_TRUE(cudf::has_nonempty_nulls(sliced)); + + auto const results = gather(sliced, {1, 2, 0, 3}); + auto const results_list_view = lists_column_view(*results); + auto const expected = LCW{{ + {}, + {"88888888", "999999999", "1010101010"}, + {"55555"}, + {"11", "22", "33", "44"}, + }, + null_at(0)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_list_view.offsets(), offsets_col_t{0, 0, 3, 4, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + results_list_view.child(), + strings_column_wrapper{ + "88888888", "999999999", "1010101010", "55555", "11", "22", "33", "44"}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*results)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*results)); + } +} + +// List. +TEST_F(PurgeNonEmptyNullsTest, UnsanitizedListOfUnsanitizedStrings) +{ + auto strings = + strings_column_wrapper{ + {"1", "22", "3", "44", "5", "66", "7", "8888", "9", "1010"}, //<--- "8888" will be + // unsanitized. + no_nulls()} + .release(); + EXPECT_FALSE(cudf::may_have_nonempty_nulls(*strings)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*strings)); + + // Set strings nullmask, post construction. + set_null_mask(strings->mutable_view().null_mask(), 7, 8, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*strings)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*strings)); + + test_purge(strings_column_view{*strings}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + strings_column_view(*strings).offsets(), offsets_col_t{0, 1, 3, 4, 6, 7, 9, 10, 14, 15, 19} + // 10-14 indicates that "8888" is unsanitized. + ); + + // Construct a list column from the strings column. + auto const lists = make_lists_column(4, + offsets_col_t{0, 4, 5, 7, 10}.release(), + std::move(strings), + 0, + detail::make_null_mask(no_nulls(), no_nulls() + 4)); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*lists)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*lists)); + + // Set lists nullmask, post construction. + cudf::detail::set_null_mask(lists->mutable_view().null_mask(), 2, 3, false); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*lists)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*lists)); + + test_purge(lists_column_view{*lists}); + + // At this point, + // 1. {"66", "7"} will be unsanitized. + // 2. {"8888", "9", "1010"} will be actually be {NULL, "9", "1010"}. + + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + lists_column_view(*lists).offsets(), + offsets_col_t{0, 4, 5, 7, 10}); // 5-7 indicates that list row#2 is unsanitized. + + auto const result = gather(lists->view(), {1, 2, 0, 3}); + auto const expected = LCW{{{"5"}, + {}, // NULL. + {"1", "22", "3", "44"}, + {{"", "9", "1010"}, null_at(0)}}, + null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expected); + + // Ensure row#2 has been sanitized. 
+ auto const results_lists_view = lists_column_view(*result); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_lists_view.offsets(), offsets_col_t{0, 1, 1, 5, 8} + // 1-1 indicates that row#2 is sanitized. + ); + + // Ensure that "8888" has been sanitized, and stored as "". + auto const child_strings_view = strings_column_view(results_lists_view.child()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(child_strings_view.offsets(), + offsets_col_t{0, 1, 2, 4, 5, 7, 7, 8, 12}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*result)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*result)); +} + +// Struct>. +TEST_F(PurgeNonEmptyNullsTest, StructOfList) +{ + auto const structs_input = + [] { + auto child = LCW{{{{1, 2, 3, 4}, null_at(2)}, + {5}, + {6, 7}, //<--- Unsanitized row. + {8, 9, 10}}, + no_nulls()}; + EXPECT_FALSE(cudf::has_nonempty_nulls(child)); + return structs_column_wrapper{{child}, null_at(2)}; + }() + .release(); + + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*structs_input)); + EXPECT_TRUE(cudf::has_nonempty_nulls(*structs_input)); + + test_purge(structs_column_view{*structs_input}); + + // At this point, even though the structs column has a null at index 2, + // the child column has a non-empty list row at index 2: {6, 7}. + CUDF_TEST_EXPECT_COLUMNS_EQUAL(lists_column_view(structs_input->child(0)).child(), + values_col_t{{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, null_at(2)}); + + { + // Test rearrange. + auto const gather_map = gather_map_t{1, 2, 0, 3}; + auto const result = gather(structs_input->view(), gather_map); + auto const expected_result = [] { + auto child = LCW{{{5}, + LCW{}, //<--- Now, sanitized. + {{1, 2, 3, 4}, null_at(2)}, + {8, 9, 10}}, + null_at(1)}; + return structs_column_wrapper{{child}, null_at(1)}; + }(); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_result); + auto const results_child = lists_column_view(result->child(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_child.offsets(), offsets_col_t{0, 1, 1, 5, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_child.child(), + values_col_t{{5, 1, 2, 3, 4, 8, 9, 10}, null_at(3)}); + EXPECT_TRUE(cudf::may_have_nonempty_nulls(*result)); + EXPECT_FALSE(cudf::has_nonempty_nulls(*result)); + } +} + +} // namespace cudf::test From 3c208a618f7f3443d021c01ad27f560a7d71e7d7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 29 Apr 2022 09:36:29 -0400 Subject: [PATCH 11/28] Enable pydocstyle rules involving quotes (#10748) This PR enables D30* errors for pydocstyle. It also sets up the `ignore-decorators` configuration so that future PRs involving D10* errors will treat docutils decorators appropriately. Contributes to #10711. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10748 --- .pre-commit-config.yaml | 15 ++++++ python/.flake8 | 24 +++++----- python/cudf/cudf/comm/gpuarrow.py | 4 +- python/cudf/cudf/core/column/string.py | 66 +++++++++++++------------- python/cudf/cudf/core/frame.py | 4 +- python/cudf/cudf/core/series.py | 4 +- 6 files changed, 66 insertions(+), 51 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5f690f5f827..cd7b8aea6d7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,5 @@ +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
+ repos: - repo: https://github.com/PyCQA/isort rev: 5.6.4 @@ -56,6 +58,19 @@ repos: hooks: - id: pydocstyle args: ["--config=python/.flake8"] + exclude: | + (?x)^( + ci| + cpp| + conda| + docs| + java| + notebooks| + python/dask_cudf| + python/cudf_kafka| + python/custreamz| + python/cudf/cudf/tests + ) - repo: https://github.com/pre-commit/mirrors-clang-format rev: v11.1.0 hooks: diff --git a/python/.flake8 b/python/.flake8 index c645c46a216..667875030cc 100644 --- a/python/.flake8 +++ b/python/.flake8 @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. [flake8] exclude = __init__.py @@ -9,14 +9,14 @@ ignore = E203 [pydocstyle] -match = ^(.*abc\.py|.*api/types\.py|.*single_column_frame\.py|.*indexed_frame\.py)$ -# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather than include using match-dir. -match-dir = ^(?!ci|cpp|python/dask_cudf|python/cudf_kafka|python/custreamz).*$ -# In addition to numpy style, we additionally ignore: -add-ignore = - # magic methods - D105, - # no docstring in __init__ - D107, - # newlines before docstrings - D204 +# Due to https://github.com/PyCQA/pydocstyle/issues/363, we must exclude rather +# than include using match-dir. Note that as discussed in +# https://stackoverflow.com/questions/65478393/how-to-filter-directories-using-the-match-dir-flag-for-pydocstyle, +# unlike the match option above this match-dir will have no effect when +# pydocstyle is invoked from pre-commit. Therefore this exclusion list must +# also be maintained in the pre-commit config file. +match-dir = ^(?!(ci|cpp|conda|docs|java|notebooks|dask_cudf|cudf_kafka|custreamz|tests)).*$ +# Allow missing docstrings for docutils +ignore-decorators = .*(docutils|doc_apply|copy_docstring).* +select = + D30 diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py index 09b4cc5ffba..0c4d9d7f77e 100644 --- a/python/cudf/cudf/comm/gpuarrow.py +++ b/python/cudf/cudf/comm/gpuarrow.py @@ -119,12 +119,12 @@ def null(self): @property def data_raw(self): - "Accessor for the data buffer as a device array" + """Accessor for the data buffer as a device array""" return self._series._column.data_array_view @property def null_raw(self): - "Accessor for the null buffer as a device array" + """Accessor for the null buffer as a device array""" return self._series._column.mask_array_view def make_series(self): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 1d836d9b759..0db7e7d9a27 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -201,7 +201,7 @@ def __getitem__(self, key): return self.get(key) def len(self) -> SeriesOrIndex: - """ + r""" Computes the length of each element in the Series/Index. Returns @@ -213,7 +213,7 @@ def len(self) -> SeriesOrIndex: Examples -------- >>> import cudf - >>> s = cudf.Series(["dog", "", "\\n", None]) + >>> s = cudf.Series(["dog", "", "\n", None]) >>> s.str.len() 0 3 1 0 @@ -960,7 +960,7 @@ def replace( ) def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: - """ + r""" Use the ``repl`` back-ref template to create a new string with the extracted elements found using the ``pat`` expression. 
@@ -980,7 +980,7 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex:
         --------
         >>> import cudf
         >>> s = cudf.Series(["A543","Z756"])
-        >>> s.str.replace_with_backrefs('(\\\\d)(\\\\d)', 'V\\\\2\\\\1')
+        >>> s.str.replace_with_backrefs('(\\d)(\\d)', 'V\\2\\1')
         0    AV453
         1    ZV576
         dtype: object
@@ -1195,7 +1195,7 @@ def istimestamp(self, format: str) -> SeriesOrIndex:
         )
 
     def isfloat(self) -> SeriesOrIndex:
-        """
+        r"""
         Check whether all characters in each string form floating value.
 
         If a string has zero characters, False is returned for
@@ -1249,7 +1249,7 @@ def isfloat(self) -> SeriesOrIndex:
         4     True
         5    False
         dtype: bool
-        >>> s = cudf.Series(["this is plain text", "\\t\\n", "9.9", "9.9.9"])
+        >>> s = cudf.Series(["this is plain text", "\t\n", "9.9", "9.9.9"])
         >>> s.str.isfloat()
         0    False
         1    False
@@ -2239,7 +2239,7 @@ def get(self, i: int = 0) -> SeriesOrIndex:
         return self._return_or_inplace(libstrings.get(self._column, i))
 
     def get_json_object(self, json_path):
-        """
+        r"""
         Applies a JSONPath string to an input strings column
         where each row in the column is a valid json string
@@ -2258,7 +2258,7 @@ def get_json_object(self, json_path):
        >>> import cudf
        >>> s = cudf.Series(
            [
-                \\"\\"\\"
+                \"\"\"
                {
                    "store":{
                        "book":[
@@ -2277,13 +2277,13 @@ def get_json_object(self, json_path):
                        ]
                    }
                }
-                \\"\\"\\"
+                \"\"\"
            ])
        >>> s
-        0    {"store": {\\n        "book": [\\n        { "cat...
+        0    {"store": {\n        "book": [\n        { "cat...
        dtype: object
        >>> s.str.get_json_object("$.store.book")
-        0    [\\n        { "category": "reference",\\n        ...
+        0    [\n        { "category": "reference",\n        ...
        dtype: object
        """
@@ -3138,7 +3138,7 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex:
         )
 
     def strip(self, to_strip: str = None) -> SeriesOrIndex:
-        """
+        r"""
         Remove leading and trailing characters.
 
         Strip whitespaces (including newlines) or a set of
@@ -3169,11 +3169,11 @@ def strip(self, to_strip: str = None) -> SeriesOrIndex:
         Examples
         --------
         >>> import cudf
-        >>> s = cudf.Series(['1. Ant.  ', '2. Bee!\\n', '3. Cat?\\t', None])
+        >>> s = cudf.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', None])
         >>> s
         0    1. Ant.
-        1    2. Bee!\\n
-        2    3. Cat?\\t
+        1    2. Bee!\n
+        2    3. Cat?\t
         3       <NA>
         dtype: object
         >>> s.str.strip()
         0    1. Ant.
         1    2. Bee!
         2    3. Cat?
         3       <NA>
         dtype: object
-        >>> s.str.strip('123.!? \\n\\t')
+        >>> s.str.strip('123.!? \n\t')
         0    Ant
         1    Bee
         2    Cat
@@ -3197,7 +3197,7 @@ def strip(self, to_strip: str = None) -> SeriesOrIndex:
         )
 
     def lstrip(self, to_strip: str = None) -> SeriesOrIndex:
-        """
+        r"""
         Remove leading and trailing characters.
 
         Strip whitespaces (including newlines)
@@ -3228,11 +3228,11 @@ def lstrip(self, to_strip: str = None) -> SeriesOrIndex:
         Examples
         --------
         >>> import cudf
-        >>> s = cudf.Series(['1. Ant.  ', '2. Bee!\\n', '3. Cat?\\t', None])
+        >>> s = cudf.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', None])
         >>> s.str.lstrip('123.')
         0     Ant.
-        1     Bee!\\n
-        2     Cat?\\t
+        1     Bee!\n
+        2     Cat?\t
         3      <NA>
         dtype: object
         """
@@ -3244,7 +3244,7 @@ def lstrip(self, to_strip: str = None) -> SeriesOrIndex:
         )
 
     def rstrip(self, to_strip: str = None) -> SeriesOrIndex:
-        """
+        r"""
         Remove leading and trailing characters.
 
         Strip whitespaces (including newlines)
@@ -3277,14 +3277,14 @@ def rstrip(self, to_strip: str = None) -> SeriesOrIndex:
         Examples
         --------
         >>> import cudf
-        >>> s = cudf.Series(['1. Ant.  ', '2. Bee!\\n', '3. Cat?\\t', None])
+        >>> s = cudf.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', None])
         >>> s
         0    1. Ant.
-        1    2. Bee!\\n
-        2    3. Cat?\\t
+        1    2. Bee!\n
+        2    3. Cat?\t
         3       <NA>
         dtype: object
-        >>> s.str.rstrip('.!? \\n\\t')
+        >>> s.str.rstrip('.!? \n\t')
         0    1. Ant
         1    2. Bee
         2    3. Cat
@@ -3299,7 +3299,7 @@ def rstrip(self, to_strip: str = None) -> SeriesOrIndex:
         )
 
     def wrap(self, width: int, **kwargs) -> SeriesOrIndex:
-        """
+        r"""
         Wrap long strings in the Series/Index to be formatted in
         paragraphs with length less than a given width.
@@ -3340,8 +3340,8 @@ def wrap(self, width: int, **kwargs) -> SeriesOrIndex:
         >>> data = ['line to be wrapped', 'another line to be wrapped']
         >>> s = cudf.Series(data)
         >>> s.str.wrap(12)
-        0    line to be\\nwrapped
-        1    another line\\nto be\\nwrapped
+        0    line to be\nwrapped
+        1    another line\nto be\nwrapped
         dtype: object
         """
         if not is_integer(width):
@@ -3575,7 +3575,7 @@ def isempty(self) -> SeriesOrIndex:
         return self._return_or_inplace((self._column == "").fillna(False))
 
     def isspace(self) -> SeriesOrIndex:
-        """
+        r"""
         Check whether all characters in each string are whitespace.
 
         This is equivalent to running the Python string method
@@ -3623,7 +3623,7 @@ def isspace(self) -> SeriesOrIndex:
         Examples
         --------
         >>> import cudf
-        >>> s = cudf.Series([' ', '\\t\\r\\n ', ''])
+        >>> s = cudf.Series([' ', '\t\r\n ', ''])
         >>> s.str.isspace()
         0     True
         1     True
@@ -4271,7 +4271,7 @@ def normalize_spaces(self) -> SeriesOrIndex:
         )
 
     def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex:
-        """
+        r"""
         Normalizes strings characters for tokenizing.
 
         This uses the normalizer that is built into the
@@ -4280,7 +4280,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex:
         - adding padding around punctuation (unicode category starts with
           "P") as well as certain ASCII symbols like "^" and "$"
         - adding padding around the CJK Unicode block characters
-        - changing whitespace (e.g. ``\\t``, ``\\n``, ``\\r``) to space
+        - changing whitespace (e.g. ``\t``, ``\n``, ``\r``) to space
         - removing control characters (unicode categories "Cc" and "Cf")
 
         If `do_lower_case = true`, lower-casing also removes the accents.
@@ -4303,7 +4303,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex:
         Examples
         --------
         >>> import cudf
-        >>> ser = cudf.Series(["héllo, \\tworld","ĂĆCĖÑTED","$99"])
+        >>> ser = cudf.Series(["héllo, \tworld","ĂĆCĖÑTED","$99"])
         >>> ser.str.normalize_characters()
         0    hello ,  world
         1          accented
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 104ed3eeb67..d0e9e6d94c1 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -3356,7 +3356,7 @@ def to_dlpack(self):
 
     @_cudf_nvtx_annotate
     def to_string(self):
-        """
+        r"""
         Convert to string
 
         cuDF uses Pandas internals for efficient string formatting.
@@ -3373,7 +3373,7 @@ def to_string(self):
         >>> df['key'] = [0, 1, 2]
         >>> df['val'] = [float(i + 10) for i in range(3)]
         >>> df.to_string()
-        '   key   val\\n0    0  10.0\\n1    1  11.0\\n2    2  12.0'
+        '   key   val\n0    0  10.0\n1    1  11.0\n2    2  12.0'
         """
         return repr(self)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 4ff671509a0..d813db58d1e 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4614,13 +4614,13 @@ def _align_indices(series_list, how="outer", allow_non_unique=False):
 
 @_cudf_nvtx_annotate
 def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
-    """Returns a boolean array where two arrays are equal within a tolerance.
+    r"""Returns a boolean array where two arrays are equal within a tolerance.
 
     Two values in ``a`` and ``b`` are considered
     equal when the following equation is satisfied.
 
     .. math::
-        |a - b| \\le \\mathrm{atol} + \\mathrm{rtol} |b|
+        |a - b| \le \mathrm{atol} + \mathrm{rtol} |b|
 
     Parameters
     ----------

From 15e49824a8cb2a5a7ec6a6e5f273589a66f1c120 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Fri, 29 Apr 2022 10:22:10 -0500
Subject: [PATCH 12/28] Enable pydocstyle for all packages. (#10759)

Follow-up to #10748 to enable the base pydocstyle rules on all Python
packages (`dask_cudf`, `cudf_kafka`, `custreamz`) and test files.
Contributes to #10711, #10758.

Authors:
   - Bradley Dice (https://github.com/bdice)

Approvers:
   - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/10759
---
 .pre-commit-config.yaml             | 6 +-----
 python/.flake8                      | 2 +-
 python/custreamz/custreamz/kafka.py | 2 +-
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index cd7b8aea6d7..46d5223f7d3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -65,11 +65,7 @@ repos:
             conda|
             docs|
             java|
-            notebooks|
-            python/dask_cudf|
-            python/cudf_kafka|
-            python/custreamz|
-            python/cudf/cudf/tests
+            notebooks
           )
   - repo: https://github.com/pre-commit/mirrors-clang-format
     rev: v11.1.0
diff --git a/python/.flake8 b/python/.flake8
index 667875030cc..b763c209fc1 100644
--- a/python/.flake8
+++ b/python/.flake8
@@ -15,7 +15,7 @@ ignore =
 # unlike the match option above this match-dir will have no effect when
 # pydocstyle is invoked from pre-commit. Therefore this exclusion list must
 # also be maintained in the pre-commit config file.
-match-dir = ^(?!(ci|cpp|conda|docs|java|notebooks|dask_cudf|cudf_kafka|custreamz|tests)).*$
+match-dir = ^(?!(ci|cpp|conda|docs|java|notebooks)).*$
 # Allow missing docstrings for docutils
 ignore-decorators = .*(docutils|doc_apply|copy_docstring).*
 select =
diff --git a/python/custreamz/custreamz/kafka.py b/python/custreamz/custreamz/kafka.py
index f5d5031602f..0198757c68d 100644
--- a/python/custreamz/custreamz/kafka.py
+++ b/python/custreamz/custreamz/kafka.py
@@ -95,7 +95,7 @@ def read_gdf(
         message_format="json",
     ):
 
-        """
+        r"""
         Read messages from the underlying KafkaDatasource connection
         and create a cudf Dataframe

From 3c4e72e68d9406d65939b7d2fdf28b0b921840dd Mon Sep 17 00:00:00 2001
From: Devavret Makkar
Date: Fri, 29 Apr 2022 21:24:12 +0530
Subject: [PATCH 13/28] Add row hasher with nested column support (#10641)

Contributes to #10186

Authors:
   - Devavret Makkar (https://github.com/devavret)
   - Vyas Ramasubramani (https://github.com/vyasr)
   - Bradley Dice (https://github.com/bdice)

Approvers:
   - Yunsong Wang (https://github.com/PointKernel)
   - Bradley Dice (https://github.com/bdice)
   - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/10641
---
 cpp/benchmarks/stream_compaction/distinct.cpp  |  41 +++
 cpp/include/cudf/detail/hashing.hpp            |   5 +-
 cpp/include/cudf/detail/iterator.cuh           |   8 +-
 .../cudf/detail/utilities/algorithm.cuh        |  28 ++
 cpp/include/cudf/detail/utilities/column.hpp   |  10 +-
 .../cudf/table/experimental/row_operators.cuh  | 273 +++++++++++++++---
 cpp/src/hash/hashing.cu                        |  29 +-
 cpp/src/hash/murmur_hash.cu                    |  28 +-
 cpp/src/stream_compaction/distinct.cu          |  18 +-
 .../stream_compaction_common.cuh               |  22 ++
 cpp/src/table/row_operators.cu                 |  60 ++--
 cpp/tests/hashing/hash_test.cpp                | 224 +++++++++++++-
 cpp/tests/reductions/list_rank_test.cpp        |   4 +-
 .../stream_compaction/distinct_tests.cpp       | 242 ++++++++++++++++
 python/cudf/cudf/tests/test_dataframe.py       |   2 +-
 15 files changed, 880 insertions(+), 114 deletions(-)
 create mode 100644 cpp/include/cudf/detail/utilities/algorithm.cuh

diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp
index 749badc715d..149c6ad7219 100644
--- a/cpp/benchmarks/stream_compaction/distinct.cpp
+++ b/cpp/benchmarks/stream_compaction/distinct.cpp
@@ -19,6 +19,7 @@
 #include 
 #include 
+#include 
 #include 
 #include 
@@ -55,3 +56,43 @@ NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type))
   .set_name("distinct")
   .set_type_axes_names({"Type"})
   .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});
+
+template <typename Type>
+void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
+{
+  cudf::rmm_pool_raii pool_raii;
+
+  auto const size             = state.get_int64("ColumnSize");
+  auto const dtype            = cudf::type_to_id<Type>();
+  double const null_frequency = state.get_float64("null_frequency");
+
+  data_profile table_data_profile;
+  if (dtype == cudf::type_id::LIST) {
+    table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 4);
+    table_data_profile.set_distribution_params(
+      cudf::type_id::INT32, distribution_id::UNIFORM, 0, 4);
+    table_data_profile.set_list_depth(1);
+  } else {
+    // We're comparing distinct() on a non-nested column to that on a list column with the same
+    // number of distinct rows. The max list size is 4 and the number of distinct values in the
+    // list's child is 5. So the number of distinct rows in the list = 1 + 5 + 5^2 + 5^3 + 5^4 = 781
+    // We want this column to also have 781 distinct values.
+    table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 781);
+  }
+  table_data_profile.set_null_frequency(null_frequency);
+
+  auto const table = create_random_table(
+    {dtype}, table_size_bytes{static_cast<size_t>(size)}, table_data_profile, 0);
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    rmm::cuda_stream_view stream_view{launch.get_stream()};
+    auto result = cudf::detail::distinct(*table, {0}, cudf::null_equality::EQUAL, stream_view);
+  });
+}
+
+NVBENCH_BENCH_TYPES(nvbench_distinct_list,
+                    NVBENCH_TYPE_AXES(nvbench::type_list))
+  .set_name("distinct_list")
+  .set_type_axes_names({"Type"})
+  .add_float64_axis("null_frequency", {0.0, 0.1})
+  .add_int64_axis("ColumnSize", {100'000'000});
diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp
index e8e100aaec5..9958fa8f3a4 100644
--- a/cpp/include/cudf/detail/hashing.hpp
+++ b/cpp/include/cudf/detail/hashing.hpp
@@ -33,19 +33,20 @@ namespace detail {
 std::unique_ptr<column> hash(
   table_view const& input,
   hash_id hash_function               = hash_id::HASH_MURMUR3,
-  uint32_t seed                       = 0,
+  uint32_t seed                       = cudf::DEFAULT_HASH_SEED,
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 std::unique_ptr<column> murmur_hash3_32(
   table_view const& input,
+  uint32_t seed                       = cudf::DEFAULT_HASH_SEED,
   rmm::cuda_stream_view stream        = rmm::cuda_stream_default,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 template
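
A note on the r""" conversions in PATCH 11 above: they are driven by pydocstyle's
D301 check ("Use r\"\"\" if any backslashes in a docstring"). A minimal sketch of
the behavior difference, using two hypothetical functions (illustrative only, not
part of any patch):

    # In a plain docstring "\n" is parsed into a real newline, so the text that
    # help() renders no longer matches the escaped source; in a raw docstring
    # the backslash and the "n" survive as two separate characters.
    def plain():
        """Example: \n collapses to an actual newline here."""

    def raw():
        r"""Example: \n stays as a backslash followed by 'n'."""

    assert "\\n" not in plain.__doc__  # backslash consumed by the parser
    assert "\\n" in raw.__doc__        # backslash preserved verbatim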
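
The match-dir values in the .flake8 hunks above are ordinary Python regular
expressions, so the negative lookahead can be sanity-checked directly. A small
illustrative check of the final pattern from PATCH 12 (assuming pydocstyle
matches each directory name against it):

    import re

    # Reject directory names that start with one of the listed prefixes;
    # accept everything else.
    match_dir = re.compile(r"^(?!(ci|cpp|conda|docs|java|notebooks)).*$")

    assert match_dir.match("cudf")            # scanned by pydocstyle
    assert match_dir.match("dask_cudf")       # scanned once re-enabled
    assert not match_dir.match("cpp")         # excluded
    assert not match_dir.match("notebooks")   # excluded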
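
The corrected isclose relation from PATCH 11 can also be exercised numerically;
a quick illustrative check with made-up values (not from the patch):

    import cudf

    a = cudf.Series([1.0, 1.001])
    b = cudf.Series([1.0, 1.0])
    # |1.001 - 1.0| = 1e-3 exceeds atol + rtol * |b| = 1e-8 + 1e-5 * 1.0,
    # so the second pair is not close under the default tolerances.
    print(cudf.isclose(a, b, rtol=1e-05, atol=1e-08).to_pandas().tolist())
    # [True, False]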
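
The distinct-row count in the benchmark comment of PATCH 13 is a geometric
series over list lengths 0 through 4 with 5 distinct child values; the
arithmetic checks out:

    # 5**0 + 5**1 + 5**2 + 5**3 + 5**4 = 1 + 5 + 25 + 125 + 625
    assert sum(5**k for k in range(5)) == 781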