Merge branch 'branch-21.12' of https://github.com/rapidsai/cudf into …

…bug-orc-stream-overlap
rapidsai · Oct 30, 2021 · a8986ed · a8986ed
2 parents af2aa58 + 77c6f1d
commit a8986ed
Show file tree

Hide file tree

Showing 35 changed files with 1,133 additions and 642 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -70,6 +70,24 @@ repos:
                 language: system
                 files: \.(cu|cuh|h|hpp|cpp|inl)$
                 args: ['-fallback-style=none']
+              - id: cmake-format
+                name: cmake-format
+                entry: bash cpp/scripts/run-cmake-format.sh cmake-format
+                language: python
+                types: [cmake]
+                # Note that pre-commit autoupdate does not update the versions
+                # of dependencies, so we'll have to update this manually.
+                additional_dependencies:
+                  - cmakelang==0.6.13
+              - id: cmake-lint
+                name: cmake-lint
+                entry: bash cpp/scripts/run-cmake-format.sh cmake-lint
+                language: python
+                types: [cmake]
+                # Note that pre-commit autoupdate does not update the versions
+                # of dependencies, so we'll have to update this manually.
+                additional_dependencies:
+                  - cmakelang==0.6.13
 
 default_language_version:
       python: python3
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -270,6 +270,7 @@ add_library(cudf
     src/io/functions.cpp
     src/io/json/json_gpu.cu
     src/io/json/reader_impl.cu
+    src/io/orc/aggregate_orc_metadata.cpp
     src/io/orc/dict_enc.cu
     src/io/orc/orc.cpp
     src/io/orc/reader_impl.cu

diff --git a/cpp/benchmarks/join/conditional_join_benchmark.cu b/cpp/benchmarks/join/conditional_join_benchmark.cu
@@ -148,27 +148,31 @@ BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit)
   ->Unit(benchmark::kMillisecond)
   ->Args({100'000, 100'000})
   ->Args({100'000, 400'000})
+  ->Args({400'000, 100'000})
   ->Args({100'000, 1'000'000})
   ->UseManualTime();
 
 BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit)
   ->Unit(benchmark::kMillisecond)
   ->Args({100'000, 100'000})
   ->Args({100'000, 400'000})
+  ->Args({400'000, 100'000})
   ->Args({100'000, 1'000'000})
   ->UseManualTime();
 
 BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit_nulls)
   ->Unit(benchmark::kMillisecond)
   ->Args({100'000, 100'000})
   ->Args({100'000, 400'000})
+  ->Args({400'000, 100'000})
   ->Args({100'000, 1'000'000})
   ->UseManualTime();
 
 BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit_nulls)
   ->Unit(benchmark::kMillisecond)
   ->Args({100'000, 100'000})
   ->Args({100'000, 400'000})
+  ->Args({400'000, 100'000})
   ->Args({100'000, 1'000'000})
   ->UseManualTime();
 

diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md
@@ -1,6 +1,6 @@
 # Regex Features
 
-This page specifies which regex features are currently supported by libcudf strings column APIs that accept regex patterns:
+This page specifies which regular expression (regex) features are currently supported by libcudf strings column APIs that accept regex patterns:
 
 - cudf::strings::contains_re()
 - cudf::strings::matches_re()
@@ -14,6 +14,13 @@ The details are based on features documented at https://www.regular-expressions.
 
 **Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen.
 
+**Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following:
+- Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals.
+- Unmatched paired special characters like `()`, `[]`, and `{}`.
+- Empty groups, classes, or quantifiers. That is, `()` and `[]` without an enclosed expression and `{}` without a valid integer.
+- Incomplete ranges in character classes like `[-z]`, `[a-]`, and `[-]`.
+- Unqualified quantifiers. That is, a quantifier with no preceding item to match like `*a`, `a⎮?`, `(+)`, `{2}a`, etc.
+
 ## Features Supported
 
 ### Characters

diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+# This script is a pre-commit hook that wraps cmakelang's cmake linters. The
+# wrapping is necessary because RAPIDS libraries split configuration for
+# cmakelang linters between a local config file and a second config file that's
+# shared across all of RAPIDS via rapids-cmake. In order to keep it up to date
+# this file is only maintained in one place (the rapids-cmake repo) and
+# pulled down during builds. We need a way to invoke CMake linting commands
+# without causing pre-commit failures (which could block local commits or CI),
+# while also being sufficiently flexible to allow users to maintain the config
+# file independently of a build directory.
+#
+# This script provides the minimal functionality to enable those use cases. It
+# searches in a number of predefined locations for the rapids-cmake config file
+# and exits gracefully if the file is not found. If a user wishes to specify a
+# config file at a nonstandard location, they may do so by setting the
+# environment variable RAPIDS_CMAKE_FORMAT_FILE.
+# 
+# While this script can be invoked directly (but only from the repo root since
+# all paths are relative to that), it is advisable to instead use the
+# pre-commit hooks via
+# `pre-commit run (cmake-format)|(cmake-lint)`.
+#
+# Usage:
+# bash run-cmake-format.sh {cmake-format,cmake-lint} infile [infile ...]
+
+# Note that pre-commit always runs from the root of the repository, so relative
+# paths are automatically relative to the repo root.
+DEFAULT_FORMAT_FILE_LOCATIONS=(
+  "cpp/build/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json"
+  "${CUDF_ROOT:-${HOME}}/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json"
+  "cpp/libcudf_kafka/build/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json"
+)
+
+# Honor a user-supplied RAPIDS_CMAKE_FORMAT_FILE; otherwise pick the first
+# default location that exists. All expansions are quoted so that paths
+# containing whitespace (e.g. a HOME with spaces) are not word-split.
+if [ -z "${RAPIDS_CMAKE_FORMAT_FILE:-}" ]; then
+    for file_path in "${DEFAULT_FORMAT_FILE_LOCATIONS[@]}"; do
+        if [ -f "${file_path}" ]; then
+            RAPIDS_CMAKE_FORMAT_FILE="${file_path}"
+            break
+        fi
+    done
+fi
+
+# A missing config file is reported but deliberately not treated as an error
+# (exit 0), so the hook never blocks a commit just because no build directory
+# exists yet.
+if [ -z "${RAPIDS_CMAKE_FORMAT_FILE:-}" ]; then
+  echo "The rapids-cmake cmake-format configuration file was not found at any of the default search locations: "
+  echo ""
+  ( IFS=$'\n'; echo "${DEFAULT_FORMAT_FILE_LOCATIONS[*]}" )
+  echo ""
+  echo "Try setting the environment variable RAPIDS_CMAKE_FORMAT_FILE to the path to the config file."
+  exit 0
+else
+  echo "Using format file ${RAPIDS_CMAKE_FORMAT_FILE}"
+fi
+
+# $1 selects the tool; the remaining arguments are the files to process.
+# The "--" terminates the multi-valued --config-files option.
+if [[ $1 == "cmake-format" ]]; then
+  cmake-format -i --config-files cpp/cmake/config.json "${RAPIDS_CMAKE_FORMAT_FILE}" -- "${@:2}"
+elif [[ $1 == "cmake-lint" ]]; then
+  cmake-lint --config-files cpp/cmake/config.json "${RAPIDS_CMAKE_FORMAT_FILE}" -- "${@:2}"
+else
+  # Previously an unrecognized command fell through silently with status 0.
+  echo "Unknown command '$1'; expected 'cmake-format' or 'cmake-lint'." >&2
+  echo "Usage: bash run-cmake-format.sh {cmake-format,cmake-lint} infile [infile ...]" >&2
+  exit 1
+fi
diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -27,6 +27,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/functional.h>
@@ -43,18 +44,19 @@ struct scan_functor {
     if (K == aggregation::SUM)
       return cudf::is_numeric<T>() || cudf::is_duration<T>() || cudf::is_fixed_point<T>();
     else if (K == aggregation::MIN or K == aggregation::MAX)
-      return cudf::is_fixed_width<T>() and is_relationally_comparable<T, T>();
+      return !cudf::is_dictionary<T>() and is_relationally_comparable<T, T>();
     else
       return false;
   }
 
   template <typename T>
-  std::enable_if_t<is_supported<T>(), std::unique_ptr<column>> operator()(
-    column_view const& values,
-    size_type num_groups,
-    cudf::device_span<cudf::size_type const> group_labels,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+  std::enable_if_t<is_supported<T>() and not std::is_same_v<T, cudf::string_view>,
+                   std::unique_ptr<column>>
+  operator()(column_view const& values,
+             size_type num_groups,
+             cudf::device_span<cudf::size_type const> group_labels,
+             rmm::cuda_stream_view stream,
+             rmm::mr::device_memory_resource* mr)
   {
     using DeviceType       = device_storage_type_t<T>;
     using OpType           = cudf::detail::corresponding_operator_t<K>;
@@ -102,6 +104,53 @@ struct scan_functor {
     return result;
   }
 
+  // Group-wise inclusive scan overload selected when T is cudf::string_view
+  // (and is_supported<T>() holds for the aggregation kind K).
+  //
+  // The scan is computed over string_view elements into a temporary device
+  // vector, which is then materialized into a new strings column.
+  //
+  // values       - input column; its rows are read as string_view
+  // num_groups   - not used by this overload
+  // group_labels - scan keys; consecutive equal labels (thrust::equal_to)
+  //                form one scan segment. Presumably one label per input
+  //                row -- confirm against the caller.
+  // stream       - CUDA stream used for allocations and kernel launches
+  // mr           - memory resource for the returned column's device memory
+  //
+  // Returns an empty STRING column for empty input; otherwise a strings
+  // column with values.size() rows. When the input has nulls, the result
+  // carries a copy of the input's null mask and its null count.
+  template <typename T>
+  std::enable_if_t<is_supported<T>() and std::is_same_v<T, cudf::string_view>,
+                   std::unique_ptr<column>>
+  operator()(column_view const& values,
+             size_type num_groups,
+             cudf::device_span<cudf::size_type const> group_labels,
+             rmm::cuda_stream_view stream,
+             rmm::mr::device_memory_resource* mr)
+  {
+    using OpType = cudf::detail::corresponding_operator_t<K>;
+
+    if (values.is_empty()) {
+      return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
+    }
+
+    // create an empty output vector we can fill with string_view instances
+    auto results_vector = rmm::device_uvector<string_view>(values.size(), stream);
+
+    auto values_view = column_device_view::create(values, stream);
+
+    if (values.has_nulls()) {
+      // substitute the operator's identity for null rows so they do not
+      // influence the scanned result of their group
+      auto input = make_null_replacement_iterator(
+        *values_view, OpType::template identity<string_view>(), values.has_nulls());
+      thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
+                                    group_labels.begin(),
+                                    group_labels.end(),
+                                    input,
+                                    results_vector.begin(),
+                                    thrust::equal_to<size_type>{},
+                                    OpType{});
+    } else {
+      thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
+                                    group_labels.begin(),
+                                    group_labels.end(),
+                                    values_view->begin<string_view>(),
+                                    results_vector.begin(),
+                                    thrust::equal_to<size_type>{},
+                                    OpType{});
+    }
+
+    // turn the string_view vector into a strings column
+    auto results = make_strings_column(results_vector, string_view{}, stream, mr);
+    // The output nulls mirror the input nulls. The mask becomes part of the
+    // returned column, so it is allocated from `mr` rather than the default
+    // resource.
+    if (values.has_nulls())
+      results->set_null_mask(cudf::detail::copy_bitmask(values, stream, mr), values.null_count());
+    return results;
+  }
+
   template <typename T, typename... Args>
   std::enable_if_t<not is_supported<T>(), std::unique_ptr<column>> operator()(Args&&... args)
   {

diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
@@ -256,7 +256,7 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info)
 
   // Get column names
   for (auto i = 0; i < metadata.get_num_columns(); i++) {
-    result.column_names.push_back(metadata.get_column_name(i));
+    result.column_names.push_back(metadata.column_name(i));
   }
 
   // Get file-level statistics, statistics of each column of file