Merge branch 'branch-22.06' into distinct_tests

# Conflicts: # cpp/tests/stream_compaction/distinct_tests.cpp
rapidsai · May 16, 2022 · e007814 · e007814
2 parents ff6d063 + d0d7193
commit e007814
Show file tree

Hide file tree

Showing 16 changed files with 304 additions and 80 deletions.
diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py
@@ -37,7 +37,7 @@
     re.compile(r"[.]flake8[.]cython$"),
     re.compile(r"meta[.]yaml$")
 ]
-ExemptFiles = []
+ExemptFiles = ["cpp/include/cudf_test/cxxopts.hpp"]
 
 # this will break starting at year 10000, which is probably OK :)
 CheckSimple = re.compile(
@@ -230,4 +230,4 @@ def checkCopyright_main():
 
 if __name__ == "__main__":
     import sys
-    sys.exit(checkCopyright_main())
+    sys.exit(checkCopyright_main())
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
@@ -89,6 +89,7 @@ outputs:
         - test -f $PREFIX/include/cudf/detail/hashing.hpp
         - test -f $PREFIX/include/cudf/detail/interop.hpp
         - test -f $PREFIX/include/cudf/detail/is_element_valid.hpp
+        - test -f $PREFIX/include/cudf/detail/join.hpp
         - test -f $PREFIX/include/cudf/detail/null_mask.hpp
         - test -f $PREFIX/include/cudf/detail/nvtx/nvtx3.hpp
         - test -f $PREFIX/include/cudf/detail/nvtx/ranges.hpp
@@ -168,6 +169,7 @@ outputs:
         - test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp
         - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp
         - test -f $PREFIX/include/cudf/lists/detail/scatter_helper.cuh
+        - test -f $PREFIX/include/cudf/lists/detail/stream_compaction.hpp
         - test -f $PREFIX/include/cudf/lists/combine.hpp
         - test -f $PREFIX/include/cudf/lists/count_elements.hpp
         - test -f $PREFIX/include/cudf/lists/explode.hpp
@@ -178,6 +180,7 @@ outputs:
         - test -f $PREFIX/include/cudf/lists/gather.hpp
         - test -f $PREFIX/include/cudf/lists/lists_column_view.hpp
         - test -f $PREFIX/include/cudf/lists/sorting.hpp
+        - test -f $PREFIX/include/cudf/lists/stream_compaction.hpp
         - test -f $PREFIX/include/cudf/merge.hpp
         - test -f $PREFIX/include/cudf/null_mask.hpp
         - test -f $PREFIX/include/cudf/partitioning.hpp

diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile
@@ -892,7 +892,9 @@ EXCLUDE_PATTERNS       = */nvtx/* */detail/*
 # Note that the wildcards are matched against the file with absolute path, so to
 # exclude all test directories use the pattern */test/*
 
-EXCLUDE_SYMBOLS        = org::apache
+EXCLUDE_SYMBOLS        = org::apache \
+                         *_impl \
+                         *Impl
 
 # The EXAMPLE_PATH tag can be used to specify one or more files or directories
 # that contain example code fragments that are included (see the \include
@@ -2130,7 +2132,8 @@ INCLUDE_FILE_PATTERNS  =
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
 PREDEFINED              = __device__= \
-                          __host__=
+                          __host__= \
+                          DOXYGEN_SHOULD_SKIP_THIS
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
 # tag can be used to specify a list of macro names that should be expanded. The

diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
@@ -1198,7 +1198,7 @@ struct optional_accessor {
   /**
    * @brief Constructor
    *
-   * @param col Column on which to iterator over its elements.
+   * @param _col Column on which to iterate over its elements.
    * @param with_nulls Indicates if the `col` should be checked for nulls.
    */
   optional_accessor(column_device_view const& _col, Nullate with_nulls)

diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh
@@ -420,7 +420,7 @@ class self_comparator {
    * @brief Construct an owning object for performing a lexicographic comparison between two rows of
    * the same table.
    *
-   * @param table The table to compare
+   * @param t The table to compare
    * @param column_order Optional, host array the same length as a row that indicates the desired
    * ascending/descending order of each column in a row. If empty, it is assumed all columns are
    * sorted in ascending order.

diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp
@@ -156,41 +156,38 @@ constexpr bool is_fixed_point(cudf::type_id id)
   template <>                                         \
   struct id_to_type_impl<Id> {                        \
     using type = Type;                                \
-  }
+  };
 #endif
 
-/**
- * @brief Defines all of the mappings between C++ types and their corresponding
- * `cudf::type_id` values.
- */
-CUDF_TYPE_MAPPING(bool, type_id::BOOL8);
-CUDF_TYPE_MAPPING(int8_t, type_id::INT8);
-CUDF_TYPE_MAPPING(int16_t, type_id::INT16);
-CUDF_TYPE_MAPPING(int32_t, type_id::INT32);
-CUDF_TYPE_MAPPING(int64_t, type_id::INT64);
-CUDF_TYPE_MAPPING(uint8_t, type_id::UINT8);
-CUDF_TYPE_MAPPING(uint16_t, type_id::UINT16);
-CUDF_TYPE_MAPPING(uint32_t, type_id::UINT32);
-CUDF_TYPE_MAPPING(uint64_t, type_id::UINT64);
-CUDF_TYPE_MAPPING(float, type_id::FLOAT32);
-CUDF_TYPE_MAPPING(double, type_id::FLOAT64);
-CUDF_TYPE_MAPPING(cudf::string_view, type_id::STRING);
-CUDF_TYPE_MAPPING(cudf::timestamp_D, type_id::TIMESTAMP_DAYS);
-CUDF_TYPE_MAPPING(cudf::timestamp_s, type_id::TIMESTAMP_SECONDS);
-CUDF_TYPE_MAPPING(cudf::timestamp_ms, type_id::TIMESTAMP_MILLISECONDS);
-CUDF_TYPE_MAPPING(cudf::timestamp_us, type_id::TIMESTAMP_MICROSECONDS);
-CUDF_TYPE_MAPPING(cudf::timestamp_ns, type_id::TIMESTAMP_NANOSECONDS);
-CUDF_TYPE_MAPPING(cudf::duration_D, type_id::DURATION_DAYS);
-CUDF_TYPE_MAPPING(cudf::duration_s, type_id::DURATION_SECONDS);
-CUDF_TYPE_MAPPING(cudf::duration_ms, type_id::DURATION_MILLISECONDS);
-CUDF_TYPE_MAPPING(cudf::duration_us, type_id::DURATION_MICROSECONDS);
-CUDF_TYPE_MAPPING(cudf::duration_ns, type_id::DURATION_NANOSECONDS);
-CUDF_TYPE_MAPPING(dictionary32, type_id::DICTIONARY32);
-CUDF_TYPE_MAPPING(cudf::list_view, type_id::LIST);
-CUDF_TYPE_MAPPING(numeric::decimal32, type_id::DECIMAL32);
-CUDF_TYPE_MAPPING(numeric::decimal64, type_id::DECIMAL64);
-CUDF_TYPE_MAPPING(numeric::decimal128, type_id::DECIMAL128);
-CUDF_TYPE_MAPPING(cudf::struct_view, type_id::STRUCT);
+// Defines all of the mappings between C++ types and their corresponding `cudf::type_id` values.
+CUDF_TYPE_MAPPING(bool, type_id::BOOL8)
+CUDF_TYPE_MAPPING(int8_t, type_id::INT8)
+CUDF_TYPE_MAPPING(int16_t, type_id::INT16)
+CUDF_TYPE_MAPPING(int32_t, type_id::INT32)
+CUDF_TYPE_MAPPING(int64_t, type_id::INT64)
+CUDF_TYPE_MAPPING(uint8_t, type_id::UINT8)
+CUDF_TYPE_MAPPING(uint16_t, type_id::UINT16)
+CUDF_TYPE_MAPPING(uint32_t, type_id::UINT32)
+CUDF_TYPE_MAPPING(uint64_t, type_id::UINT64)
+CUDF_TYPE_MAPPING(float, type_id::FLOAT32)
+CUDF_TYPE_MAPPING(double, type_id::FLOAT64)
+CUDF_TYPE_MAPPING(cudf::string_view, type_id::STRING)
+CUDF_TYPE_MAPPING(cudf::timestamp_D, type_id::TIMESTAMP_DAYS)
+CUDF_TYPE_MAPPING(cudf::timestamp_s, type_id::TIMESTAMP_SECONDS)
+CUDF_TYPE_MAPPING(cudf::timestamp_ms, type_id::TIMESTAMP_MILLISECONDS)
+CUDF_TYPE_MAPPING(cudf::timestamp_us, type_id::TIMESTAMP_MICROSECONDS)
+CUDF_TYPE_MAPPING(cudf::timestamp_ns, type_id::TIMESTAMP_NANOSECONDS)
+CUDF_TYPE_MAPPING(cudf::duration_D, type_id::DURATION_DAYS)
+CUDF_TYPE_MAPPING(cudf::duration_s, type_id::DURATION_SECONDS)
+CUDF_TYPE_MAPPING(cudf::duration_ms, type_id::DURATION_MILLISECONDS)
+CUDF_TYPE_MAPPING(cudf::duration_us, type_id::DURATION_MICROSECONDS)
+CUDF_TYPE_MAPPING(cudf::duration_ns, type_id::DURATION_NANOSECONDS)
+CUDF_TYPE_MAPPING(dictionary32, type_id::DICTIONARY32)
+CUDF_TYPE_MAPPING(cudf::list_view, type_id::LIST)
+CUDF_TYPE_MAPPING(numeric::decimal32, type_id::DECIMAL32)
+CUDF_TYPE_MAPPING(numeric::decimal64, type_id::DECIMAL64)
+CUDF_TYPE_MAPPING(numeric::decimal128, type_id::DECIMAL128)
+CUDF_TYPE_MAPPING(cudf::struct_view, type_id::STRUCT)
 
 /**
  * @brief Use this specialization on `type_dispatcher` whenever you only need to operate on the
@@ -210,6 +207,12 @@ struct type_to_scalar_type_impl {
   using ScalarType = cudf::scalar;
 };
 
+/**
+ * @brief Macro used to define scalar type and scalar device type for
+ * `cudf::numeric_scalar` template class for numeric C++ types.
+ *
+ * @param Type The numeric C++ type
+ */
 #ifndef MAP_NUMERIC_SCALAR
 #define MAP_NUMERIC_SCALAR(Type)                                     \
   template <>                                                        \
@@ -230,7 +233,7 @@ MAP_NUMERIC_SCALAR(uint32_t)
 MAP_NUMERIC_SCALAR(uint64_t)
 MAP_NUMERIC_SCALAR(float)
 MAP_NUMERIC_SCALAR(double)
-MAP_NUMERIC_SCALAR(bool);
+MAP_NUMERIC_SCALAR(bool)
 
 template <>
 struct type_to_scalar_type_impl<std::string> {
@@ -281,6 +284,12 @@ struct type_to_scalar_type_impl<cudf::struct_view> {
   // using ScalarDeviceType = cudf::struct_scalar_device_view; // CALEB: TODO!
 };
 
+/**
+ * @brief Macro used to define scalar type and scalar device type for
+ * `cudf::timestamp_scalar` template class for timestamp C++ types.
+ *
+ * @param Type The timestamp C++ type
+ */
 #ifndef MAP_TIMESTAMP_SCALAR
 #define MAP_TIMESTAMP_SCALAR(Type)                                     \
   template <>                                                          \
@@ -296,6 +305,12 @@ MAP_TIMESTAMP_SCALAR(timestamp_ms)
 MAP_TIMESTAMP_SCALAR(timestamp_us)
 MAP_TIMESTAMP_SCALAR(timestamp_ns)
 
+/**
+ * @brief Macro used to define scalar type and scalar device type for
+ * `cudf::duration_scalar` template class for duration C++ types.
+ *
+ * @param Type The duration C++ type
+ */
 #ifndef MAP_DURATION_SCALAR
 #define MAP_DURATION_SCALAR(Type)                                     \
   template <>                                                         \

diff --git a/cpp/include/cudf_test/cxxopts.hpp b/cpp/include/cudf_test/cxxopts.hpp
@@ -20,6 +20,8 @@ THE SOFTWARE.
 #ifndef CXXOPTS_HPP_INCLUDED
 #define CXXOPTS_HPP_INCLUDED
 
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+
 #include <cctype>
 #include <cstring>
 #include <exception>
@@ -1498,4 +1500,5 @@ inline const HelpGroupDetails& Options::group_help(const std::string& group) con
 
 }  // namespace cxxopts
 
+#endif  // DOXYGEN_SHOULD_SKIP_THIS
 #endif  // CXXOPTS_HPP_INCLUDED
diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp
@@ -107,7 +107,7 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(
  * @throw cudf::logic_error if `separator` is invalid
  *
  * @param input Strings to encode.
- * @param merge_pairs Created by a call to @ref nvtext::load_merge_pairs_file.
+ * @param merges_pairs Created by a call to @ref nvtext::load_merge_pairs_file.
  * @param separator String used to build the output after encoding.
  *                  Default is a space.
  * @param mr Memory resource to allocate any returned objects.

diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
@@ -148,20 +148,33 @@ std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
     CUDF_EXPECTS(tensor.device.device_id == device_id, "DLTensor device ID must be current device");
   }
 
-  // Currently only 1D and 2D tensors are supported
-  CUDF_EXPECTS(tensor.ndim > 0 && tensor.ndim <= 2, "DLTensor must be 1D or 2D");
-
+  // We only support 1D and 2D tensors with some restrictions on layout
+  if (tensor.ndim == 1) {
+    // 1D tensors must have dense layout (strides == nullptr <=> dense row-major)
+    CUDF_EXPECTS(nullptr == tensor.strides || tensor.strides[0] == 1,
+                 "from_dlpack of 1D DLTensor only for unit-stride data");
+  } else if (tensor.ndim == 2) {
+    // 2D tensors must have column-major layout and the fastest dimension must have dense layout
+    CUDF_EXPECTS((
+                   // 1D tensor reshaped into (N, 1) is fine
+                   tensor.shape[1] == 1 && (nullptr == tensor.strides || tensor.strides[0] == 1))
+                   // General case
+                   || (nullptr != tensor.strides && tensor.strides[0] == 1 &&
+                       tensor.strides[1] >= tensor.shape[0]),
+                 "from_dlpack of 2D DLTensor only for column-major unit-stride data");
+  } else {
+    CUDF_FAIL("DLTensor must be 1D or 2D");
+  }
   CUDF_EXPECTS(tensor.shape[0] >= 0,
-               "DLTensor first dim should be of shape greater than or equal-to 0.");
+               "DLTensor first dim should be of shape greater than or equal to 0.");
   CUDF_EXPECTS(tensor.shape[0] < std::numeric_limits<size_type>::max(),
                "DLTensor first dim exceeds size supported by cudf");
   if (tensor.ndim > 1) {
     CUDF_EXPECTS(tensor.shape[1] >= 0,
-                 "DLTensor second dim should be of shape greater than or equal-to 0.");
+                 "DLTensor second dim should be of shape greater than or equal to 0.");
     CUDF_EXPECTS(tensor.shape[1] < std::numeric_limits<size_type>::max(),
                  "DLTensor second dim exceeds size supported by cudf");
   }
-
   size_t const num_columns = (tensor.ndim == 2) ? static_cast<size_t>(tensor.shape[1]) : 1;
 
   // Validate and convert data type to cudf

diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
@@ -43,6 +43,7 @@
 #include <rmm/cuda_stream_view.hpp>
 
 #include <thrust/host_vector.h>
+#include <thrust/iterator/counting_iterator.h>
 
 #include <algorithm>
 #include <iostream>
@@ -696,37 +697,62 @@ table_with_metadata read_csv(cudf::io::datasource* source,
 
     column_flags.resize(num_actual_columns, column_parse::enabled | column_parse::inferred);
 
+    std::vector<size_t> col_loop_order(column_names.size());
+    auto unnamed_it = std::copy_if(
+      thrust::make_counting_iterator<size_t>(0),
+      thrust::make_counting_iterator<size_t>(column_names.size()),
+      col_loop_order.begin(),
+      [&column_names](auto col_idx) -> bool { return not column_names[col_idx].empty(); });
     // Rename empty column names to "Unnamed: col_index"
-    for (size_t col_idx = 0; col_idx < column_names.size(); ++col_idx) {
-      if (column_names[col_idx].empty()) {
-        column_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx);
-      }
-    }
+    std::copy_if(thrust::make_counting_iterator<size_t>(0),
+                 thrust::make_counting_iterator<size_t>(column_names.size()),
+                 unnamed_it,
+                 [&column_names](auto col_idx) -> bool {
+                   auto is_empty = column_names[col_idx].empty();
+                   if (is_empty)
+                     column_names[col_idx] = string("Unnamed: ") + std::to_string(col_idx);
+                   return is_empty;
+                 });
 
     // Looking for duplicates
-    std::unordered_map<string, int> col_names_histogram;
-    for (auto& col_name : column_names) {
-      // Operator [] inserts a default-initialized value if the given key is not
-      // present
-      if (++col_names_histogram[col_name] > 1) {
-        if (reader_opts.is_enabled_mangle_dupe_cols()) {
-          // Rename duplicates of column X as X.1, X.2, ...; First appearance
-          // stays as X
-          do {
-            col_name += "." + std::to_string(col_names_histogram[col_name] - 1);
-          } while (col_names_histogram[col_name]++);
-        } else {
+    std::unordered_map<string, int> col_names_counts;
+    if (!reader_opts.is_enabled_mangle_dupe_cols()) {
+      for (auto& col_name : column_names) {
+        if (++col_names_counts[col_name] > 1) {
           // All duplicate columns will be ignored; First appearance is parsed
           const auto idx    = &col_name - column_names.data();
           column_flags[idx] = column_parse::disabled;
         }
       }
+    } else {
+      // Multiset of the current column names, for average constant-time (worst-case linear) lookup.
+      std::unordered_multiset<std::string> header(column_names.begin(), column_names.end());
+      for (auto const col_idx : col_loop_order) {
+        auto col       = column_names[col_idx];
+        auto cur_count = col_names_counts[col];
+        if (cur_count > 0) {
+          auto const old_col = col;
+          // Rename duplicates of column X as X.1, X.2, ...; First appearance stays as X
+          while (cur_count > 0) {
+            col_names_counts[old_col] = cur_count + 1;
+            col                       = old_col + "." + std::to_string(cur_count);
+            if (header.find(col) != header.end()) {
+              cur_count++;
+            } else {
+              cur_count = col_names_counts[col];
+            }
+          }
+          if (auto pos = header.find(old_col); pos != header.end()) { header.erase(pos); }
+          header.insert(col);
+          column_names[col_idx] = col;
+        }
+        col_names_counts[col] = cur_count + 1;
+      }
     }
 
-    // Update the number of columns to be processed, if some might have been
-    // removed
+    // Update the number of columns to be processed, if some might have been removed
     if (!reader_opts.is_enabled_mangle_dupe_cols()) {
-      num_active_columns = col_names_histogram.size();
+      num_active_columns = col_names_counts.size();
     }
   }
 

diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
@@ -59,7 +59,7 @@ std::unique_ptr<table> distinct(table_view const& input,
   auto keys_view = input.select(keys);
   auto preprocessed_keys =
     cudf::experimental::row::hash::preprocessed_table::create(keys_view, stream);
-  auto has_null = nullate::DYNAMIC{cudf::has_nulls(keys_view)};
+  auto const has_null = nullate::DYNAMIC{cudf::has_nested_nulls(keys_view)};
   auto const num_rows{keys_view.num_rows()};
 
   hash_map_type key_map{compute_hash_table_size(num_rows),