Skip to content

Commit

Permalink
Merge branch 'branch-21.12' of https://github.com/rapidsai/cudf into …
Browse files Browse the repository at this point in the history
…bug-orc-stream-overlap
  • Loading branch information
vuule committed Oct 30, 2021
2 parents af2aa58 + 77c6f1d commit a8986ed
Show file tree
Hide file tree
Showing 35 changed files with 1,133 additions and 642 deletions.
18 changes: 18 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,24 @@ repos:
language: system
files: \.(cu|cuh|h|hpp|cpp|inl)$
args: ['-fallback-style=none']
- id: cmake-format
name: cmake-format
entry: bash cpp/scripts/run-cmake-format.sh cmake-format
language: python
types: [cmake]
# Note that pre-commit autoupdate does not update the versions
# of dependencies, so we'll have to update this manually.
additional_dependencies:
- cmakelang==0.6.13
- id: cmake-lint
name: cmake-lint
entry: bash cpp/scripts/run-cmake-format.sh cmake-lint
language: python
types: [cmake]
# Note that pre-commit autoupdate does not update the versions
# of dependencies, so we'll have to update this manually.
additional_dependencies:
- cmakelang==0.6.13

default_language_version:
python: python3
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ add_library(cudf
src/io/functions.cpp
src/io/json/json_gpu.cu
src/io/json/reader_impl.cu
src/io/orc/aggregate_orc_metadata.cpp
src/io/orc/dict_enc.cu
src/io/orc/orc.cpp
src/io/orc/reader_impl.cu
Expand Down
4 changes: 4 additions & 0 deletions cpp/benchmarks/join/conditional_join_benchmark.cu
Original file line number Diff line number Diff line change
Expand Up @@ -148,27 +148,31 @@ BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit)
->Unit(benchmark::kMillisecond)
->Args({100'000, 100'000})
->Args({100'000, 400'000})
->Args({400'000, 100'000})
->Args({100'000, 1'000'000})
->UseManualTime();

// Registers the ConditionalJoin fixture's conditional_inner_join_64bit case.
// Results are reported in milliseconds, and UseManualTime() tells the benchmark
// library to use the time recorded by the benchmark body instead of measuring
// wall-clock time itself. Each Args() entry adds one run configuration.
// NOTE(review): the two values are presumably {left rows, right rows} -- confirm
// against the fixture implementation.
BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit)
->Unit(benchmark::kMillisecond)
->Args({100'000, 100'000})
->Args({100'000, 400'000})
->Args({400'000, 100'000})
->Args({100'000, 1'000'000})
->UseManualTime();

// Registers conditional_inner_join_32bit_nulls with the same four argument
// pairs, millisecond reporting and manual timing as the registration above.
BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit_nulls)
->Unit(benchmark::kMillisecond)
->Args({100'000, 100'000})
->Args({100'000, 400'000})
->Args({400'000, 100'000})
->Args({100'000, 1'000'000})
->UseManualTime();

// Registers conditional_inner_join_64bit_nulls with the same four argument
// pairs, millisecond reporting and manual timing as the registrations above.
BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit_nulls)
->Unit(benchmark::kMillisecond)
->Args({100'000, 100'000})
->Args({100'000, 400'000})
->Args({400'000, 100'000})
->Args({100'000, 1'000'000})
->UseManualTime();

Expand Down
9 changes: 8 additions & 1 deletion cpp/doxygen/regex.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Regex Features

This page specifies which regex features are currently supported by libcudf strings column APIs that accept regex patterns:
This page specifies which regular expression (regex) features are currently supported by libcudf strings column APIs that accept regex patterns:

- cudf::strings::contains_re()
- cudf::strings::matches_re()
Expand All @@ -14,6 +14,13 @@ The details are based on features documented at https://www.regular-expressions.

**Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen.

**Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following:
- Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals.
- Unmatched paired special characters like `()`, `[]`, and `{}`.
- Empty groups, classes, or quantifiers. That is, `()` and `[]` without an enclosing expression and `{}` without a valid integer.
- Incomplete ranges in character classes like `[-z]`, `[a-]`, and `[-]`.
- Unqualified quantifiers. That is, a quantifier with no preceding item to match like `*a`, `a⎮?`, `(+)`, `{2}a`, etc.

## Features Supported

### Characters
Expand Down
59 changes: 59 additions & 0 deletions cpp/scripts/run-cmake-format.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/bin/bash

# This script is a pre-commit hook that wraps cmakelang's cmake linters. The
# wrapping is necessary because RAPIDS libraries split configuration for
# cmakelang linters between a local config file and a second config file that's
# shared across all of RAPIDS via rapids-cmake. In order to keep it up to date
# this file is only maintained in one place (the rapids-cmake repo) and
# pulled down during builds. We need a way to invoke CMake linting commands
# without causing pre-commit failures (which could block local commits or CI),
# while also being sufficiently flexible to allow users to maintain the config
# file independently of a build directory.
#
# This script provides the minimal functionality to enable those use cases. It
# searches in a number of predefined locations for the rapids-cmake config file
# and exits gracefully if the file is not found. If a user wishes to specify a
# config file at a nonstandard location, they may do so by setting the
# environment variable RAPIDS_CMAKE_FORMAT_FILE.
#
# While this script can be invoked directly (but only from the repo root since
# all paths are relative to that), it is advisable to instead use the
# pre-commit hooks via
# `pre-commit run (cmake-format)|(cmake-lint)`.
#
# Usage:
# bash run-cmake-format.sh {cmake-format,cmake-lint} infile [infile ...]
#
# Exit status:
# - 0 when the shared config file cannot be found (linting is skipped on
#   purpose so that a missing build directory never blocks a commit).
# - 1 when the first argument is neither cmake-format nor cmake-lint.
# - Otherwise, the exit status of the invoked cmakelang tool.

# Note that pre-commit always runs from the root of the repository, so relative
# paths are automatically relative to the repo root.
DEFAULT_FORMAT_FILE_LOCATIONS=(
  "cpp/build/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json"
  "${CUDF_ROOT:-${HOME}}/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json"
  "cpp/libcudf_kafka/build/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json"
)

# Only search the defaults when the user has not supplied a location. Every
# expansion is quoted so that paths containing spaces or glob characters
# (e.g. a CUDF_ROOT or HOME with a space in it) are treated as a single path.
if [[ -z "${RAPIDS_CMAKE_FORMAT_FILE:-}" ]]; then
  for file_path in "${DEFAULT_FORMAT_FILE_LOCATIONS[@]}"; do
    if [[ -f "${file_path}" ]]; then
      RAPIDS_CMAKE_FORMAT_FILE="${file_path}"
      break
    fi
  done
fi

if [[ -z "${RAPIDS_CMAKE_FORMAT_FILE:-}" ]]; then
  echo "The rapids-cmake cmake-format configuration file was not found at any of the default search locations: "
  echo ""
  ( IFS=$'\n'; echo "${DEFAULT_FORMAT_FILE_LOCATIONS[*]}" )
  echo ""
  echo "Try setting the environment variable RAPIDS_CMAKE_FORMAT_FILE to the path to the config file."
  # Deliberately succeed: a missing shared config must not block commits or CI.
  exit 0
else
  echo "Using format file ${RAPIDS_CMAKE_FORMAT_FILE}"
fi

# "${@:2}" forwards every input file as its own argument, preserving any
# whitespace in file names.
if [[ "$1" == "cmake-format" ]]; then
  cmake-format -i --config-files cpp/cmake/config.json "${RAPIDS_CMAKE_FORMAT_FILE}" -- "${@:2}"
elif [[ "$1" == "cmake-lint" ]]; then
  cmake-lint --config-files cpp/cmake/config.json "${RAPIDS_CMAKE_FORMAT_FILE}" -- "${@:2}"
else
  # Previously an unknown command was silently ignored and reported success.
  echo "Unknown command '$1'. Usage: bash run-cmake-format.sh {cmake-format,cmake-lint} infile [infile ...]" >&2
  exit 1
fi
63 changes: 56 additions & 7 deletions cpp/src/groupby/sort/group_scan_util.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/functional.h>
Expand All @@ -43,18 +44,19 @@ struct scan_functor {
if (K == aggregation::SUM)
return cudf::is_numeric<T>() || cudf::is_duration<T>() || cudf::is_fixed_point<T>();
else if (K == aggregation::MIN or K == aggregation::MAX)
return cudf::is_fixed_width<T>() and is_relationally_comparable<T, T>();
return !cudf::is_dictionary<T>() and is_relationally_comparable<T, T>();
else
return false;
}

template <typename T>
std::enable_if_t<is_supported<T>(), std::unique_ptr<column>> operator()(
column_view const& values,
size_type num_groups,
cudf::device_span<cudf::size_type const> group_labels,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
std::enable_if_t<is_supported<T>() and not std::is_same_v<T, cudf::string_view>,
std::unique_ptr<column>>
operator()(column_view const& values,
size_type num_groups,
cudf::device_span<cudf::size_type const> group_labels,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
using DeviceType = device_storage_type_t<T>;
using OpType = cudf::detail::corresponding_operator_t<K>;
Expand Down Expand Up @@ -102,6 +104,53 @@ struct scan_functor {
return result;
}

/**
 * @brief Inclusive scan within each group for string columns.
 *
 * Selected only when `T` is `cudf::string_view` and the aggregation kind `K`
 * is supported for it. The scan is computed over `string_view` elements and
 * the scanned views are then turned into a new strings column.
 *
 * @param values Input column to scan
 * @param num_groups Number of groups (not used by this overload)
 * @param group_labels Group label per row; consecutive equal labels form one scan segment
 * @param stream CUDA stream used for allocations and kernel launches
 * @param mr Device memory resource passed to `make_strings_column` for the result
 * @return Strings column holding the per-group running result; when the input
 *         has nulls, the result carries a copy of the input's null mask
 */
template <typename T>
std::enable_if_t<is_supported<T>() and std::is_same_v<T, cudf::string_view>,
                 std::unique_ptr<column>>
operator()(column_view const& values,
           size_type num_groups,
           cudf::device_span<cudf::size_type const> group_labels,
           rmm::cuda_stream_view stream,
           rmm::mr::device_memory_resource* mr)
{
  using ScanOp = cudf::detail::corresponding_operator_t<K>;

  // Nothing to scan: return an empty strings column.
  if (values.is_empty()) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); }

  bool const input_has_nulls = values.has_nulls();

  // Scratch buffer that receives one scanned string_view per input row.
  auto scanned = rmm::device_uvector<string_view>(values.size(), stream);

  auto d_values = column_device_view::create(values, stream);

  // Both branches below run the same keyed scan and differ only in the
  // iterator that feeds it.
  auto const scan_groups = [&](auto first_value) {
    thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
                                  group_labels.begin(),
                                  group_labels.end(),
                                  first_value,
                                  scanned.begin(),
                                  thrust::equal_to<size_type>{},
                                  ScanOp{});
  };

  if (input_has_nulls) {
    // Null rows are read as the operator's identity value for the scan.
    scan_groups(make_null_replacement_iterator(
      *d_values, ScanOp::template identity<string_view>(), input_has_nulls));
  } else {
    scan_groups(d_values->begin<string_view>());
  }

  // Turn the string_view vector into a strings column.
  // NOTE(review): string_view{} is presumably the placeholder that marks null
  // entries -- confirm against make_strings_column.
  auto output = make_strings_column(scanned, string_view{}, stream, mr);
  if (input_has_nulls) {
    output->set_null_mask(cudf::detail::copy_bitmask(values, stream), values.null_count());
  }
  return output;
}

template <typename T, typename... Args>
std::enable_if_t<not is_supported<T>(), std::unique_ptr<column>> operator()(Args&&... args)
{
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/io/functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info)

// Get column names
for (auto i = 0; i < metadata.get_num_columns(); i++) {
result.column_names.push_back(metadata.get_column_name(i));
result.column_names.push_back(metadata.column_name(i));
}

// Get file-level statistics, statistics of each column of file
Expand Down
Loading

0 comments on commit a8986ed

Please sign in to comment.