diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch index ae1962e4738..0dd9854d4aa 100644 --- a/cpp/cmake/thrust.patch +++ b/cpp/cmake/thrust.patch @@ -114,3 +114,29 @@ index d0e3f94..76774b0 100644 /** * Dispatch between 32-bit and 64-bit index based versions of the same algorithm * implementation. This version allows using different token sequences for callables +diff --git a/thrust/iterator/transform_input_output_iterator.h b/thrust/iterator/transform_input_output_iterator.h +index f512a36..a5f725d 100644 +--- a/thrust/iterator/transform_input_output_iterator.h ++++ b/thrust/iterator/transform_input_output_iterator.h +@@ -102,6 +102,8 @@ template + /*! \endcond + */ + ++ transform_input_output_iterator() = default; ++ + /*! This constructor takes as argument a \c Iterator an \c InputFunction and an + * \c OutputFunction and copies them to a new \p transform_input_output_iterator + * +diff --git a/thrust/iterator/transform_output_iterator.h b/thrust/iterator/transform_output_iterator.h +index 66fb46a..4a68cb5 100644 +--- a/thrust/iterator/transform_output_iterator.h ++++ b/thrust/iterator/transform_output_iterator.h +@@ -104,6 +104,8 @@ template + /*! \endcond + */ + ++ transform_output_iterator() = default; ++ + /*! This constructor takes as argument an \c OutputIterator and an \c + * UnaryFunction and copies them to a new \p transform_output_iterator + * diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index cf21da1b030..f43089210fd 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -207,9 +207,31 @@ std::unique_ptr rank( /** * @brief Returns sorted order after sorting each segment in the table. * - * If segment_offsets contains values larger than number of rows, behavior is undefined. + * If segment_offsets contains values larger than the number of rows, the behavior is undefined. * @throws cudf::logic_error if `segment_offsets` is not `size_type` column. 
* + * @code{.pseudo} + * Example: + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * offsets = {0, 3, 7, 10} + * result = cudf::segmented_sorted_order(keys, offsets); + * result is { 2,1,0, 6,5,4,3, 9,8,7 } + * @endcode + * + * If segment_offsets is empty or contains a single index, no values are sorted + * and the result is a sequence of integers from 0 to keys.size()-1. + * + * The segment_offsets are not required to include all indices. Any indices + * outside the specified segments will not be sorted. + * + * @code{.pseudo} + * Example: (offsets do not cover all indices) + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * offsets = {3, 7} + * result = cudf::segmented_sorted_order(keys, offsets); + * result is { 0,1,2, 6,5,4,3, 7,8,9 } + * @endcode + * * @param keys The table that determines the ordering of elements in each segment * @param segment_offsets The column of `size_type` type containing start offset index for each * contiguous segment. @@ -246,10 +268,34 @@ std::unique_ptr stable_segmented_sorted_order( /** * @brief Performs a lexicographic segmented sort of a table * - * If segment_offsets contains values larger than number of rows, behavior is undefined. + * If segment_offsets contains values larger than the number of rows, the behavior is undefined. * @throws cudf::logic_error if `values.num_rows() != keys.num_rows()`. * @throws cudf::logic_error if `segment_offsets` is not `size_type` column. * + * @code{.pseudo} + * Example: + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * values = { {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'} } + * offsets = {0, 3, 7, 10} + * result = cudf::segmented_sort_by_key(keys, values, offsets); + * result is { 'c','b','a', 'g','f','e','d', 'j','i','h' } + * @endcode + * + * If segment_offsets is empty or contains a single index, no values are sorted + * and the result is a copy of the values. + * + * The segment_offsets are not required to include all indices. 
Any indices + * outside the specified segments will not be sorted. + * + * @code{.pseudo} + * Example: (offsets do not cover all indices) + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * values = { {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'} } + * offsets = {3, 7} + * result = cudf::segmented_sort_by_key(keys, values, offsets); + * result is { 'a','b','c', 'g','f','e','d', 'h','i','j' } + * @endcode + * * @param values The table to reorder * @param keys The table that determines the ordering of elements in each segment * @param segment_offsets The column of `size_type` type containing start offset index for each diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index d54bb5c8ea9..872e742a5af 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -403,7 +403,7 @@ void make_device_json_column(device_span input, std::string name = ""; auto parent_col_id = column_parent_ids[this_col_id]; if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - name = "element"; + name = list_child_name; } else if (column_categories[parent_col_id] == NC_FN) { auto field_name_col_id = parent_col_id; parent_col_id = column_parent_ids[parent_col_id]; @@ -689,19 +689,24 @@ std::pair, std::vector> device_json_co size_type num_rows = json_col.child_offsets.size() - 1; std::vector column_names{}; column_names.emplace_back("offsets"); - column_names.emplace_back(json_col.child_columns.begin()->first); + column_names.emplace_back( + json_col.child_columns.empty() ? 
list_child_name : json_col.child_columns.begin()->first); // Note: json_col modified here, reuse the memory auto offsets_column = std::make_unique( data_type{type_id::INT32}, num_rows + 1, json_col.child_offsets.release()); // Create children column auto [child_column, names] = - device_json_column_to_cudf_column(json_col.child_columns.begin()->second, - d_input, - options, - get_child_schema(json_col.child_columns.begin()->first), - stream, - mr); + json_col.child_columns.empty() + ? std::pair, + std::vector>{std::make_unique(), {}} + : device_json_column_to_cudf_column( + json_col.child_columns.begin()->second, + d_input, + options, + get_child_schema(json_col.child_columns.begin()->first), + stream, + mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); return {make_lists_column(num_rows, diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index dbf026c351e..cf041b02a20 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -29,6 +29,8 @@ #include +#include + #include #include #include @@ -39,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -125,6 +128,75 @@ struct node_ranges { } }; +/** + * @brief Returns stable sorted keys and its sorted order + * + * Uses cub stable radix sort. The order is internally generated, hence it saves a copy and memory. + * Since the key and order is returned, using double buffer helps to avoid extra copy to user + * provided output iterator. + * + * @tparam IndexType sorted order type + * @tparam KeyType key type + * @param keys keys to sort + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ * @return Sorted keys and indices producing that sorted order + */ +template +std::pair, rmm::device_uvector> stable_sorted_key_order( + cudf::device_span keys, rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + + // Determine temporary device storage requirements + rmm::device_uvector keys_buffer1(keys.size(), stream); + rmm::device_uvector keys_buffer2(keys.size(), stream); + rmm::device_uvector order_buffer1(keys.size(), stream); + rmm::device_uvector order_buffer2(keys.size(), stream); + cub::DoubleBuffer order_buffer(order_buffer1.data(), order_buffer2.data()); + cub::DoubleBuffer keys_buffer(keys_buffer1.data(), keys_buffer2.data()); + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size()); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + + thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin()); + thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end()); + + cub::DeviceRadixSort::SortPairs( + d_temp_storage.data(), temp_storage_bytes, keys_buffer, order_buffer, keys.size()); + + return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1) + : std::move(keys_buffer2), + order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1) + : std::move(order_buffer2)}; +} + +/** + * @brief Propagate parent node to siblings from first sibling. + * + * @param node_levels Node levels of each node + * @param parent_node_ids parent node ids initialized for first child of each push node, + * and other siblings are initialized to -1. + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +void propagate_parent_to_siblings(cudf::device_span node_levels, + cudf::device_span parent_node_ids, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto [sorted_node_levels, sorted_order] = stable_sorted_key_order(node_levels, stream); + // instead of gather, using permutation_iterator, which is ~17% faster + + thrust::inclusive_scan_by_key( + rmm::exec_policy(stream), + sorted_node_levels.begin(), + sorted_node_levels.end(), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::equal_to{}, + thrust::maximum{}); +} + // Generates a tree representation of the given tokens, token_indices. tree_meta_t get_tree_representation(device_span tokens, device_span token_indices, @@ -166,12 +238,86 @@ tree_meta_t get_tree_representation(device_span tokens, }; auto num_tokens = tokens.size(); - auto is_node_it = thrust::make_transform_iterator( - tokens.begin(), - [is_node] __device__(auto t) -> size_type { return static_cast(is_node(t)); }); - auto num_nodes = thrust::count_if( + auto num_nodes = thrust::count_if( rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node); + // Node levels: transform_exclusive_scan, copy_if. 
+ rmm::device_uvector node_levels(num_nodes, stream, mr); + { + rmm::device_uvector token_levels(num_tokens, stream); + auto push_pop_it = thrust::make_transform_iterator( + tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { + return does_push(token) - does_pop(token); + }); + thrust::exclusive_scan( + rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); + + auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream), + token_levels.begin(), + token_levels.begin() + num_tokens, + tokens.begin(), + node_levels.begin(), + is_node); + CUDF_EXPECTS(thrust::distance(node_levels.begin(), node_levels_end) == num_nodes, + "node level count mismatch"); + } + + // Node parent ids: + // previous push node_id transform, stable sort by level, segmented scan with Max, reorder. + rmm::device_uvector parent_node_ids(num_nodes, stream, mr); + // This block of code is a generalized logical stack algorithm. TODO: make this a separate function. + { + rmm::device_uvector node_token_ids(num_nodes, stream); + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_tokens, + tokens.begin(), + node_token_ids.begin(), + is_node); + + // previous push node_id + // if previous node is a push, then i-1 + // if previous node is FE, then i-2 (returns FB's index) + // if previous node is SMB and its previous node is a push, then i-2 + // eg. `{ SMB FB FE VB VE SME` -> `{` index as FB's parent. 
+ // else -1 + auto first_childs_parent_token_id = [tokens_gpu = + tokens.begin()] __device__(auto i) -> NodeIndexT { + if (i <= 0) { return -1; } + if (tokens_gpu[i - 1] == token_t::StructBegin or tokens_gpu[i - 1] == token_t::ListBegin) { + return i - 1; + } else if (tokens_gpu[i - 1] == token_t::FieldNameEnd) { + return i - 2; + } else if (tokens_gpu[i - 1] == token_t::StructMemberBegin and + (tokens_gpu[i - 2] == token_t::StructBegin || + tokens_gpu[i - 2] == token_t::ListBegin)) { + return i - 2; + } else { + return -1; + } + }; + + thrust::transform( + rmm::exec_policy(stream), + node_token_ids.begin(), + node_token_ids.end(), + parent_node_ids.begin(), + [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id] __device__( + NodeIndexT const tid) -> NodeIndexT { + auto pid = first_childs_parent_token_id(tid); + return pid < 0 + ? parent_node_sentinel + : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + // parent_node_sentinel is -1, useful for segmented max operation below + }); + } + // Propagate parent node to siblings from first sibling - inplace. + propagate_parent_to_siblings( + cudf::device_span{node_levels.data(), node_levels.size()}, + parent_node_ids, + stream); + // Node categories: copy_if with transform. rmm::device_uvector node_categories(num_nodes, stream, mr); auto node_categories_it = @@ -184,24 +330,6 @@ tree_meta_t get_tree_representation(device_span tokens, CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes, "node category count mismatch"); - // Node levels: transform_exclusive_scan, copy_if. 
- rmm::device_uvector token_levels(num_tokens, stream); - auto push_pop_it = thrust::make_transform_iterator( - tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { - return does_push(token) - does_pop(token); - }); - thrust::exclusive_scan( - rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); - - rmm::device_uvector node_levels(num_nodes, stream, mr); - auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream), - token_levels.begin(), - token_levels.begin() + num_tokens, - tokens.begin(), - node_levels.begin(), - is_node); - CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch"); - // Node ranges: copy_if with transform. rmm::device_uvector node_range_begin(num_nodes, stream, mr); rmm::device_uvector node_range_end(num_nodes, stream, mr); @@ -223,69 +351,6 @@ tree_meta_t get_tree_representation(device_span tokens, }); CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); - // Node parent ids: previous push token_id transform, stable sort, segmented scan with Max, - // reorder, copy_if. This one is sort of logical stack. But more generalized. - // TODO: make it own function. - rmm::device_uvector parent_token_ids(num_tokens, stream); - rmm::device_uvector initial_order(num_tokens, stream); - // TODO re-write the algorithm to work only on nodes, not tokens. - - thrust::sequence(rmm::exec_policy(stream), initial_order.begin(), initial_order.end()); - thrust::tabulate(rmm::exec_policy(stream), - parent_token_ids.begin(), - parent_token_ids.end(), - [does_push, tokens_gpu = tokens.begin()] __device__(auto i) -> size_type { - return (i > 0) && does_push(tokens_gpu[i - 1]) ? i - 1 : -1; - // -1, not sentinel used here because of max operation below - }); - - auto out_pid = thrust::make_zip_iterator(parent_token_ids.data(), initial_order.data()); - // Uses radix sort for builtin types. 
- thrust::stable_sort_by_key(rmm::exec_policy(stream), - token_levels.data(), - token_levels.data() + token_levels.size(), - out_pid); - - // SegmentedScan Max. - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - token_levels.data(), - token_levels.data() + token_levels.size(), - parent_token_ids.data(), - parent_token_ids.data(), - thrust::equal_to{}, - thrust::maximum{}); - - // scatter to restore the original order. - { - rmm::device_uvector temp_storage(num_tokens, stream); - thrust::scatter(rmm::exec_policy(stream), - parent_token_ids.begin(), - parent_token_ids.end(), - initial_order.begin(), - temp_storage.begin()); - thrust::copy( - rmm::exec_policy(stream), temp_storage.begin(), temp_storage.end(), parent_token_ids.begin()); - } - - rmm::device_uvector node_ids_gpu(num_tokens, stream); - thrust::exclusive_scan( - rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); - - rmm::device_uvector parent_node_ids(num_nodes, stream, mr); - auto parent_node_ids_it = thrust::make_transform_iterator( - parent_token_ids.begin(), - [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { - return pid < 0 ? 
parent_node_sentinel : node_ids_gpu[pid]; - }); - auto parent_node_ids_end = thrust::copy_if(rmm::exec_policy(stream), - parent_node_ids_it, - parent_node_ids_it + parent_token_ids.size(), - tokens.begin(), - parent_node_ids.begin(), - is_node); - CUDF_EXPECTS(parent_node_ids_end - parent_node_ids.begin() == num_nodes, - "parent node id gather mismatch"); - return {std::move(node_categories), std::move(parent_node_ids), std::move(node_levels), diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 10d209b2ea6..8a0f3566d58 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -104,6 +104,9 @@ enum node_t : NodeT { */ enum class json_col_t : char { ListColumn, StructColumn, StringColumn, Unknown }; +// Default name for a list's child column +constexpr auto list_child_name{"element"}; + /** * @brief Intermediate representation of data from a nested JSON input */ diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 5d60a564b9b..29a29a1f9d5 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1162,9 +1162,6 @@ void make_json_column(json_column& root_column, // Range of encapsulating function that parses to internal columnar data representation CUDF_FUNC_RANGE(); - // Default name for a list's child column - std::string const list_child_name = "element"; - // Parse the JSON and get the token stream const auto [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); @@ -1286,7 +1283,7 @@ void make_json_column(json_column& root_column, * (b) a list, the selected child column corresponds to single child column of * the list column. In this case, the child column may not exist yet. 
*/ - auto get_selected_column = [&list_child_name](std::stack& current_data_path) { + auto get_selected_column = [](std::stack& current_data_path) { json_column* selected_col = current_data_path.top().current_selected_col; // If the node does not have a selected column yet @@ -1680,7 +1677,8 @@ std::pair, std::vector> json_column_to size_type num_rows = json_col.child_offsets.size(); std::vector column_names{}; column_names.emplace_back("offsets"); - column_names.emplace_back(json_col.child_columns.begin()->first); + column_names.emplace_back( + json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first); rmm::device_uvector d_offsets = cudf::detail::make_device_uvector_async(json_col.child_offsets, stream, mr); @@ -1688,12 +1686,15 @@ std::pair, std::vector> json_column_to std::make_unique(data_type{type_id::INT32}, num_rows, d_offsets.release()); // Create children column auto [child_column, names] = - json_column_to_cudf_column(json_col.child_columns.begin()->second, - d_input, - options, - get_child_schema(json_col.child_columns.begin()->first), - stream, - mr); + json_col.child_columns.empty() + ? std::pair, + std::vector>{std::make_unique(), {}} + : json_column_to_cudf_column(json_col.child_columns.begin()->second, + d_input, + options, + get_child_schema(json_col.child_columns.begin()->first), + stream, + mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); return {make_lists_column(num_rows - 1, diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index a5f6d737637..57d55be6145 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -1860,11 +1860,8 @@ void PreprocessColumnData(hostdevice_vector& pages, out_buf.create(size, stream, mr); } - // for nested hierarchies, compute per-page start offset. 
- // it would be better/safer to be checking (schema.max_repetition_level > 0) here, but there's - // no easy way to get at that info here. we'd have to move this function into reader_impl.cu - if ((out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) || - out_buf.type.id() == type_id::LIST) { + // for nested hierarchies, compute per-page start offset + if (input_col.has_repetition) { thrust::exclusive_scan_by_key(rmm::exec_policy(stream), page_keys.begin(), page_keys.end(), diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 8f4cd5c6f3b..1a8c0f4cd9e 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -57,9 +57,16 @@ constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1; struct input_column_info { int schema_idx; std::string name; + bool has_repetition; // size == nesting depth. the associated real output // buffer index in the dest column for each level of nesting. std::vector nesting; + + input_column_info(int _schema_idx, std::string _name, bool _has_repetition) + : schema_idx(_schema_idx), name(_name), has_repetition(_has_repetition) + { + } + auto nesting_depth() const { return nesting.size(); } }; diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 07869189089..0997d2a968d 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -684,8 +684,8 @@ class aggregate_reader_metadata { // if I have no children, we're at a leaf and I'm an input column (that is, one with actual // data stored) so add me to the list. 
if (schema_elem.num_children == 0) { - input_column_info& input_col = - input_columns.emplace_back(input_column_info{schema_idx, schema_elem.name}); + input_column_info& input_col = input_columns.emplace_back( + input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0}); // set up child output column for one-level encoding list if (schema_elem.is_one_level_list()) { diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index cb9cd4293b5..25094536cce 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -225,6 +225,7 @@ dremel_data get_dremel_data(column_view h_col, cudf::detail::device_single_thread( [offset_at_level = d_column_offsets.data(), end_idx_at_level = d_column_ends.data(), + level_max = d_column_offsets.size(), col = *d_col] __device__() { auto curr_col = col; size_type off = curr_col.offset(); @@ -239,9 +240,11 @@ dremel_data get_dremel_data(column_view h_col, if (curr_col.type().id() == type_id::LIST) { off = curr_col.child(lists_column_view::offsets_column_index).element(off); end = curr_col.child(lists_column_view::offsets_column_index).element(end); - offset_at_level[level] = off; - end_idx_at_level[level] = end; - ++level; + if (level < level_max) { + offset_at_level[level] = off; + end_idx_at_level[level] = end; + ++level; + } curr_col = curr_col.child(lists_column_view::child_column_index); } else { curr_col = curr_col.child(0); diff --git a/cpp/src/sort/segmented_sort.cu b/cpp/src/sort/segmented_sort.cu index 3422330bf8b..c5f13df5305 100644 --- a/cpp/src/sort/segmented_sort.cu +++ b/cpp/src/sort/segmented_sort.cu @@ -24,7 +24,6 @@ #include #include -#include namespace cudf { namespace detail { @@ -35,24 +34,49 @@ namespace { */ enum class sort_method { STABLE, UNSTABLE }; -// returns segment indices for each element for all segments. -// first segment begin index = 0, last segment end index = num_rows. 
+/** + * @brief Builds indices to identify segments to sort + * + * The segments are added to the input table-view keys so they + * are lexicographically sorted within the segmented groups. + * + * ``` + * Example 1: + * num_rows = 10 + * offsets = {0, 3, 7, 10} + * segment-indices -> { 3,3,3, 7,7,7,7, 10,10,10 } + * ``` + * + * ``` + * Example 2: (offsets do not cover all indices) + * num_rows = 10 + * offsets = {3, 7} + * segment-indices -> { 0,1,2, 7,7,7,7, 8,9,10 } + * ``` + * + * @param num_rows Total number of rows in the input keys to sort + * @param offsets The offsets identifying the segments + * @param stream CUDA stream used for device memory operations and kernel launches + */ rmm::device_uvector get_segment_indices(size_type num_rows, column_view const& offsets, rmm::cuda_stream_view stream) { rmm::device_uvector segment_ids(num_rows, stream); - auto offset_begin = offsets.begin(); // assumes already offset column contains offset. - auto offsets_minus_one = thrust::make_transform_iterator( - offset_begin, [offset_begin] __device__(auto i) { return i - 1; }); + auto offset_begin = offsets.begin(); + auto offset_end = offsets.end(); auto counting_iter = thrust::make_counting_iterator(0); - thrust::lower_bound(rmm::exec_policy(stream), - offsets_minus_one, - offsets_minus_one + offsets.size(), - counting_iter, - counting_iter + segment_ids.size(), - segment_ids.begin()); + thrust::transform(rmm::exec_policy(stream), + counting_iter, + counting_iter + segment_ids.size(), + segment_ids.begin(), + [offset_begin, offset_end] __device__(auto idx) { + if (offset_begin == offset_end || idx < *offset_begin) { return idx; } + if (idx >= *(offset_end - 1)) { return idx + 1; } + return static_cast( + *thrust::upper_bound(thrust::seq, offset_begin, offset_end, idx)); + }); return segment_ids; } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index d7ab881861a..b8cd4622484 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ 
-813,7 +813,6 @@ TEST_P(JsonReaderDualTest, JsonLinesObjectsOutOfOrder) cudf::test::strings_column_wrapper({"aaa", "bbb"})); } -/* // currently, the json reader is strict about having non-empty input. TEST_F(JsonReaderTest, EmptyFile) { @@ -824,7 +823,9 @@ TEST_F(JsonReaderTest, EmptyFile) } cudf::io::json_reader_options in_options = - cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}).lines(true); + cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .experimental(true); auto result = cudf::io::read_json(in_options); const auto view = result.tbl->view(); @@ -832,6 +833,7 @@ TEST_F(JsonReaderTest, EmptyFile) } // currently, the json reader is strict about having non-empty input. +// experimental reader supports empty input TEST_F(JsonReaderTest, NoDataFile) { auto filepath = temp_env->get_temp_dir() + "NoDataFile.csv"; @@ -841,13 +843,14 @@ TEST_F(JsonReaderTest, NoDataFile) } cudf::io::json_reader_options in_options = - cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}).lines(true); + cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .experimental(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); const auto view = result.tbl->view(); EXPECT_EQ(0, view.num_columns()); } -*/ TEST_F(JsonReaderTest, ArrowFileSource) { diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index 3d024fe8af8..6f7e28a2ca3 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -773,7 +773,11 @@ std::vector json_lines_list = { { "a": { "y" : 6, "z": [] }} { "a": { "y" : 6, "z": [2, 3, 4, 5] }} { "a": { "z": [4], "y" : 6 }} - { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )"}; + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )", + // empty list, row. 
+ R"( {"a" : [], "b" : {}} + {"a" : []} + {"b" : {}})"}; INSTANTIATE_TEST_SUITE_P(Mixed_And_Records, JsonTreeTraversalTest, ::testing::Combine(::testing::Values(false), diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 134eff54144..6f1c5ef7eb1 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -2633,6 +2633,11 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes) 0, [string_per_row](cudf::size_type idx) { return idx * string_per_row; }); cudf::test::fixed_width_column_wrapper offsets(offset_iter, offset_iter + num_rows + 1); + + auto _c3_valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 200; }); + std::vector c3_valids(num_rows); + std::copy(_c3_valids, _c3_valids + num_rows, c3_valids.begin()); auto _c3_list = cudf::make_lists_column(num_rows, offsets.release(), @@ -2646,7 +2651,7 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes) c3_children.push_back(std::move(c3_list)); c3_children.push_back(c3_ints.release()); c3_children.push_back(c3_floats.release()); - cudf::test::structs_column_wrapper _c3(std::move(c3_children)); + cudf::test::structs_column_wrapper _c3(std::move(c3_children), c3_valids); auto c3 = cudf::purge_nonempty_nulls(static_cast(_c3)); // write it out diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp index 4fd62f9b938..a8547ea982d 100644 --- a/cpp/tests/reductions/segmented_reduction_tests.cpp +++ b/cpp/tests/reductions/segmented_reduction_tests.cpp @@ -655,9 +655,9 @@ TEST_F(SegmentedReductionTestUntyped, PartialSegmentReduction) auto const input = fixed_width_column_wrapper{ {1, 2, 3, 4, 5, 6, 7}, {true, true, true, true, true, true, true}}; - auto const offsets = std::vector{0, 1, 3, 4}; + auto const offsets = std::vector{1, 3, 4}; auto const d_offsets = thrust::device_vector(offsets); - auto const expect = fixed_width_column_wrapper{{1, 5, 4}, {true, true, 
true}}; + auto const expect = fixed_width_column_wrapper{{5, 4}, {true, true}}; auto res = segmented_reduce(input, d_offsets, @@ -669,7 +669,7 @@ TEST_F(SegmentedReductionTestUntyped, PartialSegmentReduction) // Test with initial value auto const init_scalar = cudf::make_fixed_width_scalar(3); - auto const init_expect = fixed_width_column_wrapper{{4, 8, 7}, {true, true, true}}; + auto const init_expect = fixed_width_column_wrapper{{8, 7}, {true, true}}; res = segmented_reduce(input, d_offsets, @@ -681,8 +681,7 @@ TEST_F(SegmentedReductionTestUntyped, PartialSegmentReduction) // Test with null initial value init_scalar->set_valid_async(false); - auto null_init_expect = - fixed_width_column_wrapper{{XXX, XXX, XXX}, {false, false, false}}; + auto null_init_expect = fixed_width_column_wrapper{{XXX, XXX}, {false, false}}; res = segmented_reduce(input, d_offsets, diff --git a/cpp/tests/sort/segmented_sort_tests.cpp b/cpp/tests/sort/segmented_sort_tests.cpp index fb07bfde795..53642a89b3d 100644 --- a/cpp/tests/sort/segmented_sort_tests.cpp +++ b/cpp/tests/sort/segmented_sort_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -201,9 +201,13 @@ TEST_F(SegmentedSortInt, NonZeroSegmentsStart) column_wrapper segments1{{0, 2, 5, 8, 11}}; column_wrapper segments2{{ 2, 5, 8, 11}}; column_wrapper segments3{{ 6, 8, 11}}; + column_wrapper segments4{{ 6, 8}}; + column_wrapper segments5{{0, 3, 6}}; column_wrapper expected1{{0, 1, 2, 4, 3, 7, 5, 6, 9, 10, 8}}; column_wrapper expected2{{0, 1, 2, 4, 3, 7, 5, 6, 9, 10, 8}}; - column_wrapper expected3{{2, 4, 5, 3, 0, 1, 7, 6, 9, 10, 8}}; + column_wrapper expected3{{0, 1, 2, 3, 4, 5, 7, 6, 9, 10, 8}}; + column_wrapper expected4{{0, 1, 2, 3, 4, 5, 7, 6, 8, 9, 10}}; + column_wrapper expected5{{2, 0, 1, 4, 5, 3, 6, 7, 8, 9, 10}}; // clang-format on table_view input{{col1}}; auto results = cudf::detail::segmented_sorted_order(input, segments1); @@ -212,6 +216,10 @@ TEST_F(SegmentedSortInt, NonZeroSegmentsStart) CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected2); results = cudf::detail::segmented_sorted_order(input, segments3); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected3); + results = cudf::detail::segmented_sorted_order(input, segments4); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected4); + results = cudf::detail::segmented_sorted_order(input, segments5); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected5); } TEST_F(SegmentedSortInt, Sliced) @@ -219,13 +227,13 @@ TEST_F(SegmentedSortInt, Sliced) using T = int; // clang-format off column_wrapper col1{{8, 9, 2, 3, 2, 2, 4, 1, 7, 5, 6}}; - // sliced 2, 2, 4, 1, 7, 5, 6 + // sliced 2, 2, 4, 1, 7, 5, 6 column_wrapper segments1{{0, 2, 5}}; column_wrapper segments2{{-4, 0, 2, 5}}; column_wrapper segments3{{ 7}}; column_wrapper expected1{{0, 1, 3, 2, 4, 5, 6}}; column_wrapper expected2{{0, 1, 3, 2, 4, 5, 6}}; - column_wrapper expected3{{3, 0, 1, 2, 5, 6, 4}}; + column_wrapper expected3{{0, 1, 2, 3, 4, 5, 6}}; // clang-format on auto slice = cudf::slice(col1, {4, 11})[0]; // 7 elements table_view input{{slice}}; diff --git a/python/cudf/cudf/tests/test_json.py 
b/python/cudf/cudf/tests/test_json.py index 1fdef44546a..fb2c24b3757 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -649,6 +649,24 @@ def test_json_nested_data(): assert df.to_arrow().equals(pa_table_pdf) +def test_json_empty_types(): + json_str = """ {} + {"a": [], "b": {}} + {"a": []} + {"b": {}} + {"c": {"d": []}} + {"e": [{}]} + """ + df = cudf.read_json( + StringIO(json_str), + engine="cudf_experimental", + orient="records", + lines=True, + ) + pdf = pd.read_json(StringIO(json_str), orient="records", lines=True) + assert_eq(df, pdf) + + def test_json_types_data(): # 0:<0:string,1:float> # 1:list diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 366b2e0ebae..0a0647f1297 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -451,7 +451,7 @@ """ doc_to_orc = docfmt_partial(docstring=_docstring_to_orc) -_docstring_read_json = """ +_docstring_read_json = r""" Load a JSON dataset into a DataFrame Parameters @@ -466,8 +466,13 @@ engine : {{ 'auto', 'cudf', 'cudf_experimental', 'pandas' }}, default 'auto' Parser engine to use. If 'auto' is passed, the engine will be automatically selected based on the other parameters. -orient : string, - Indication of expected JSON string format (pandas engine only). +orient : string + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + + Indication of expected JSON string format. Compatible JSON strings can be produced by ``to_json()`` with a corresponding orient value. The set of possible orients is: @@ -500,12 +505,23 @@ typ : type of object to recover (series or frame), default 'frame' With cudf engine, only frame output is supported. dtype : boolean or dict, default True - If True, infer dtypes, if a dict of column to dtype, then use those, - if False, then don't infer dtypes at all, applies only to the data. 
+ If True, infer dtypes for all columns; if False, then don't infer dtypes at all, + if a dict, provide a mapping from column names to their respective dtype (any missing + columns will have their dtype inferred). Applies only to the data. convert_axes : boolean, default True - Try to convert the axes to the proper dtypes (pandas engine only). + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + + Try to convert the axes to the proper dtypes. convert_dates : boolean, default True - List of columns to parse for dates (pandas engine only); If True, then try + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + + List of columns to parse for dates; If True, then try to parse datelike columns default is True; a column label is datelike if * it ends with ``'_at'``, @@ -514,27 +530,57 @@ * it is ``'modified'``, or * it is ``'date'`` keep_default_dates : boolean, default True - If parsing dates, parse the default datelike columns (pandas engine only) + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + + If parsing dates, parse the default datelike columns. numpy : boolean, default False - Direct decoding to numpy arrays (pandas engine only). Supports numeric + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + + Direct decoding to numpy arrays. Supports numeric data only, but non-numeric column and index labels are supported. Note also that the JSON ordering MUST be the same for each term if numpy=True. precise_float : boolean, default False + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + Set to enable usage of higher precision (strtod) function when decoding string to double values (pandas engine only). 
Default (False) is to use fast but less precise builtin functionality date_unit : string, default None - The timestamp unit to detect if converting dates (pandas engine only). + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + + The timestamp unit to detect if converting dates. The default behavior is to try and detect the correct precision, but if this is not desired then pass one of 's', 'ms', 'us' or 'ns' to force parsing only seconds, milliseconds, microseconds or nanoseconds. encoding : str, default is 'utf-8' + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + The encoding to use to decode py3 bytes. With cudf engine, only utf-8 is supported. lines : boolean, default False Read the file as a json object per line. chunksize : integer, default None - Return JsonReader object for iteration (pandas engine only). + + .. admonition:: Not GPU-accelerated + + This parameter is only supported with ``engine='pandas'``. + + Return JsonReader object for iteration. See the `line-delimited json docs `_ for more information on ``chunksize``. @@ -547,12 +593,22 @@ otherwise. If using 'zip', the ZIP file must contain only one data file to be read in. Set to None for no decompression. byte_range : list or tuple, default None - Byte range within the input file to be read (cudf engine only). + + .. admonition:: GPU-accelerated + + This parameter is only supported with ``engine='cudf'``. + + Byte range within the input file to be read. The first number is the offset in bytes, the second number is the range size in bytes. Set the size to zero to read all data after the offset location. Reads the row that starts before or at the end of the range, even if it ends after the end of the range. keep_quotes : bool, default False + + .. admonition:: GPU-accelerated experimental feature + + This parameter is only supported with ``engine='cudf_experimental'``. 
- This parameter is only supported in ``cudf_experimental`` engine. If `True`, any string values are read literally (and wrapped in an additional set of quotes).