Merge branch 'branch-22.12' into example-strings

rapidsai · Oct 14, 2022 · 37585a9 · 37585a9
2 parents 83055cc + e91d7d9
commit 37585a9
Show file tree

Hide file tree

Showing 18 changed files with 425 additions and 155 deletions.
diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch
@@ -114,3 +114,29 @@ index d0e3f94..76774b0 100644
  /**
   * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
   * implementation. This version allows using different token sequences for callables
+diff --git a/thrust/iterator/transform_input_output_iterator.h b/thrust/iterator/transform_input_output_iterator.h
+index f512a36..a5f725d 100644
+--- a/thrust/iterator/transform_input_output_iterator.h
++++ b/thrust/iterator/transform_input_output_iterator.h
+@@ -102,6 +102,8 @@ template <typename InputFunction, typename OutputFunction, typename Iterator>
+   /*! \endcond
+    */
+
++  transform_input_output_iterator() = default;
++
+   /*! This constructor takes as argument a \c Iterator an \c InputFunction and an
+    * \c OutputFunction and copies them to a new \p transform_input_output_iterator
+    *
+diff --git a/thrust/iterator/transform_output_iterator.h b/thrust/iterator/transform_output_iterator.h
+index 66fb46a..4a68cb5 100644
+--- a/thrust/iterator/transform_output_iterator.h
++++ b/thrust/iterator/transform_output_iterator.h
+@@ -104,6 +104,8 @@ template <typename UnaryFunction, typename OutputIterator>
+   /*! \endcond
+    */
+
++  transform_output_iterator() = default;
++
+   /*! This constructor takes as argument an \c OutputIterator and an \c
+    * UnaryFunction and copies them to a new \p transform_output_iterator
+    *
diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp
@@ -207,9 +207,31 @@ std::unique_ptr<column> rank(
 /**
  * @brief Returns sorted order after sorting each segment in the table.
  *
- * If segment_offsets contains values larger than number of rows, behavior is undefined.
+ * If segment_offsets contains values larger than the number of rows, the behavior is undefined.
  * @throws cudf::logic_error if `segment_offsets` is not a `size_type` column.
  *
+ * @code{.pseudo}
+ * Example:
+ * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} }
+ * offsets = {0, 3, 7, 10}
+ * result = cudf::segmented_sorted_order(keys, offsets);
+ * result is { 2,1,0, 6,5,4,3, 9,8,7 }
+ * @endcode
+ *
+ * If segment_offsets is empty or contains a single index, no values are sorted
+ * and the result is a sequence of integers from 0 to keys.num_rows()-1.
+ *
+ * The segment_offsets are not required to include all indices. Any indices
+ * outside the specified segments will not be sorted.
+ *
+ * @code{.pseudo}
+ * Example: (offsets do not cover all indices)
+ * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} }
+ * offsets = {3, 7}
+ * result = cudf::segmented_sorted_order(keys, offsets);
+ * result is { 0,1,2, 6,5,4,3, 7,8,9 }
+ * @endcode
+ *
  * @param keys The table that determines the ordering of elements in each segment
  * @param segment_offsets The column of `size_type` type containing start offset index for each
  * contiguous segment.
@@ -246,10 +268,34 @@ std::unique_ptr<column> stable_segmented_sorted_order(
 /**
  * @brief Performs a lexicographic segmented sort of a table
  *
- * If segment_offsets contains values larger than number of rows, behavior is undefined.
+ * If segment_offsets contains values larger than the number of rows, the behavior is undefined.
  * @throws cudf::logic_error if `values.num_rows() != keys.num_rows()`.
  * @throws cudf::logic_error if `segment_offsets` is not a `size_type` column.
  *
+ * @code{.pseudo}
+ * Example:
+ * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} }
+ * values = { {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'} }
+ * offsets = {0, 3, 7, 10}
+ * result = cudf::segmented_sort_by_key(keys, values, offsets);
+ * result is { 'c','b','a', 'g','f','e','d', 'j','i','h' }
+ * @endcode
+ *
+ * If segment_offsets is empty or contains a single index, no values are sorted
+ * and the result is a copy of the values.
+ *
+ * The segment_offsets are not required to include all indices. Any indices
+ * outside the specified segments will not be sorted.
+ *
+ * @code{.pseudo}
+ * Example: (offsets do not cover all indices)
+ * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} }
+ * values = { {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'} }
+ * offsets = {3, 7}
+ * result = cudf::segmented_sort_by_key(keys, values, offsets);
+ * result is { 'a','b','c', 'g','f','e','d', 'h','i','j' }
+ * @endcode
+ *
  * @param values The table to reorder
  * @param keys The table that determines the ordering of elements in each segment
  * @param segment_offsets The column of `size_type` type containing start offset index for each

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
@@ -403,7 +403,7 @@ void make_device_json_column(device_span<SymbolT const> input,
     std::string name   = "";
     auto parent_col_id = column_parent_ids[this_col_id];
     if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) {
-      name = "element";
+      name = list_child_name;
     } else if (column_categories[parent_col_id] == NC_FN) {
       auto field_name_col_id = parent_col_id;
       parent_col_id          = column_parent_ids[parent_col_id];
@@ -689,19 +689,24 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
       size_type num_rows = json_col.child_offsets.size() - 1;
       std::vector<column_name_info> column_names{};
       column_names.emplace_back("offsets");
-      column_names.emplace_back(json_col.child_columns.begin()->first);
+      column_names.emplace_back(
+        json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first);
 
       // Note: json_col modified here, reuse the memory
       auto offsets_column = std::make_unique<column>(
         data_type{type_id::INT32}, num_rows + 1, json_col.child_offsets.release());
       // Create children column
       auto [child_column, names] =
-        device_json_column_to_cudf_column(json_col.child_columns.begin()->second,
-                                          d_input,
-                                          options,
-                                          get_child_schema(json_col.child_columns.begin()->first),
-                                          stream,
-                                          mr);
+        json_col.child_columns.empty()
+          ? std::pair<std::unique_ptr<column>,
+                      std::vector<column_name_info>>{std::make_unique<column>(), {}}
+          : device_json_column_to_cudf_column(
+              json_col.child_columns.begin()->second,
+              d_input,
+              options,
+              get_child_schema(json_col.child_columns.begin()->first),
+              stream,
+              mr);
       column_names.back().children      = names;
       auto [result_bitmask, null_count] = make_validity(json_col);
       return {make_lists_column(num_rows,