Skip to content

Commit

Permalink
Merge branch 'branch-22.06' into regex-classes-cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Apr 11, 2022
2 parents 5261da9 + df6bd3c commit 3ee2f7b
Show file tree
Hide file tree
Showing 17 changed files with 523 additions and 150 deletions.
260 changes: 258 additions & 2 deletions CHANGELOG.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion cpp/benchmarks/sort/sort_structs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,5 +80,5 @@ void nvbench_sort_struct(nvbench::state& state)
NVBENCH_BENCH(nvbench_sort_struct)
.set_name("sort_struct")
.add_int64_power_of_two_axis("NumRows", {10, 18, 26})
.add_int64_axis("Depth", {1, 8})
.add_int64_axis("Depth", {0, 1, 8})
.add_int64_axis("Nulls", {0, 1});
6 changes: 3 additions & 3 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -44,8 +44,8 @@ class json_reader_options_builder;
/**
* @brief Input arguments to the `read_json` interface.
*
* Available parameters and are closely patterned after PANDAS' `read_json` API.
* Not all parameters are unsupported. If the matching PANDAS' parameter
* Available parameters are closely patterned after PANDAS' `read_json` API.
* Not all parameters are supported. If the matching PANDAS' parameter
* has a default value of `None`, then a default value of `-1` or `0` may be
* used as the equivalent.
*
Expand Down
6 changes: 5 additions & 1 deletion cpp/include/cudf/table/experimental/row_operators.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,11 @@ class device_row_comparator {
return cuda::std::make_pair(state, depth);
}

// Structs have been modified to only have 1 child when using this.
if (lcol.num_child_columns() == 0) {
return cuda::std::make_pair(weak_ordering::EQUIVALENT, depth);
}

// Non-empty structs have been modified to only have 1 child when using this.
lcol = lcol.children()[0];
rcol = rcol.children()[0];
++depth;
Expand Down
93 changes: 21 additions & 72 deletions cpp/src/strings/contains.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* limitations under the License.
*/

#include <strings/count_matches.hpp>
#include <strings/regex/dispatcher.hpp>
#include <strings/regex/regex.cuh>
#include <strings/utilities.hpp>
Expand Down Expand Up @@ -114,6 +115,26 @@ std::unique_ptr<column> matches_re(
return regex_dispatcher(*d_prog, contains_dispatch_fn{*d_prog, true}, input, stream, mr);
}

/**
 * @brief Counts the regex pattern matches for each row of a strings column.
 *
 * Compiles `pattern` into a device regex program, asks `count_matches` for a
 * result sized to `input.size()` rows, and, when the input has nulls, attaches
 * a copy of the input's null mask and its null count to the result.
 *
 * @param input Strings column to evaluate.
 * @param pattern Regex pattern to compile.
 * @param flags Regex flags forwarded to the regex program creation.
 * @param stream CUDA stream forwarded to every call made here.
 * @param mr Device memory resource forwarded to `count_matches` and the bitmask copy.
 * @return The column produced by `count_matches`, with the null mask applied if needed.
 */
std::unique_ptr<column> count_re(strings_column_view const& input,
                                 std::string const& pattern,
                                 regex_flags const flags,
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
{
  auto const num_rows = input.size();

  // Build the device-side regex program from the pattern.
  auto regex_prog =
    reprog_device::create(pattern, flags, get_character_flags_table(), num_rows, stream);

  auto const strings_dv = column_device_view::create(input.parent(), stream);
  auto counts           = count_matches(*strings_dv, *regex_prog, num_rows, stream, mr);

  // Without nulls in the input there is no mask to carry over.
  if (not input.has_nulls()) { return counts; }

  counts->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr),
                        input.null_count());
  return counts;
}

} // namespace detail

// external APIs
Expand All @@ -136,78 +157,6 @@ std::unique_ptr<column> matches_re(strings_column_view const& strings,
return detail::matches_re(strings, pattern, flags, rmm::cuda_stream_default, mr);
}

namespace detail {
namespace {
/**
 * @brief Functor that returns how many times the regex pattern matches in one string row.
 *
 * A null row produces a count of zero.
 *
 * @tparam stack_size Template argument forwarded to `reprog_device::find`.
 */
template <int stack_size>
struct count_fn {
  reprog_device prog;
  column_device_view const d_strings;

  __device__ int32_t operator()(unsigned int idx)
  {
    if (d_strings.is_null(idx)) { return 0; }

    auto const d_str      = d_strings.element<string_view>(idx);
    auto const char_count = d_str.length();

    int32_t matches = 0;
    for (int32_t begin = 0; begin < char_count;) {
      // Each search runs from the current position to the end of the string.
      auto end = static_cast<int32_t>(char_count);
      if (prog.find<stack_size>(idx, d_str, begin, end) <= 0) { break; }
      ++matches;
      // begin/end are assumed to be updated by find() to the match range
      // (confirm in regex.cuh). Resume after the match, stepping one
      // position forward when the match did not advance past `begin`.
      if (end > begin) {
        begin = end;
      } else {
        ++begin;
      }
    }
    return matches;
  }
};

/**
 * @brief Functor handed to `regex_dispatcher` that builds the INT32 match-count column.
 *
 * The output column is created with a copy of the input's null mask and its
 * null count, then filled by running `count_fn<stack_size>` over every row index.
 */
struct count_dispatch_fn {
  reprog_device d_prog;

  template <int stack_size>
  std::unique_ptr<column> operator()(strings_column_view const& input,
                                     rmm::cuda_stream_view stream,
                                     rmm::mr::device_memory_resource* mr)
  {
    auto const num_rows = input.size();

    // Allocate the output with the input's null mask copied onto it.
    auto output = make_numeric_column(data_type{type_id::INT32},
                                      num_rows,
                                      cudf::detail::copy_bitmask(input.parent(), stream, mr),
                                      input.null_count(),
                                      stream,
                                      mr);

    auto const d_strings = column_device_view::create(input.parent(), stream);

    // One functor invocation per row index in [0, num_rows).
    auto const first = thrust::make_counting_iterator<size_type>(0);
    auto const last  = thrust::make_counting_iterator<size_type>(num_rows);
    thrust::transform(rmm::exec_policy(stream),
                      first,
                      last,
                      output->mutable_view().data<int32_t>(),
                      count_fn<stack_size>{d_prog, *d_strings});
    return output;
  }
};

} // namespace

/**
 * @brief Counts the regex pattern matches for each row of a strings column.
 *
 * Compiles `pattern` into a device regex program and passes a
 * `count_dispatch_fn` holding that program to `regex_dispatcher`.
 *
 * @param input Strings column to evaluate.
 * @param pattern Regex pattern to compile.
 * @param flags Regex flags forwarded to the regex program creation.
 * @param stream CUDA stream forwarded to the program creation and the dispatcher.
 * @param mr Device memory resource forwarded to the dispatcher.
 * @return The column returned by `regex_dispatcher`.
 */
std::unique_ptr<column> count_re(
  strings_column_view const& input,
  std::string const& pattern,
  regex_flags const flags,
  rmm::cuda_stream_view stream,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
{
  // Build the device-side regex program from the pattern.
  auto regex_prog =
    reprog_device::create(pattern, flags, get_character_flags_table(), input.size(), stream);

  // The dispatcher receives the program and a functor holding a copy of it.
  return regex_dispatcher(*regex_prog, count_dispatch_fn{*regex_prog}, input, stream, mr);
}

} // namespace detail

// external API

std::unique_ptr<column> count_re(strings_column_view const& strings,
std::string const& pattern,
regex_flags const flags,
Expand Down
26 changes: 12 additions & 14 deletions cpp/src/strings/count_matches.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,16 @@ struct count_matches_fn {
__device__ size_type operator()(size_type idx)
{
if (d_strings.is_null(idx)) { return 0; }
size_type count = 0;
auto const d_str = d_strings.element<string_view>(idx);
size_type count = 0;
auto const d_str = d_strings.element<string_view>(idx);
auto const nchars = d_str.length();

int32_t begin = 0;
int32_t end = d_str.length();
int32_t end = nchars;
while ((begin < end) && (prog.find<stack_size>(idx, d_str, begin, end) > 0)) {
++count;
begin = end + (begin == end);
end = d_str.length();
end = nchars;
}
return count;
}
Expand All @@ -62,11 +63,14 @@ struct count_dispatch_fn {

template <int stack_size>
std::unique_ptr<column> operator()(column_device_view const& d_strings,
size_type output_size,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
assert(output_size >= d_strings.size() and "Unexpected output size");

auto results = make_numeric_column(
data_type{type_id::INT32}, d_strings.size() + 1, mask_state::UNALLOCATED, stream, mr);
data_type{type_id::INT32}, output_size, mask_state::UNALLOCATED, stream, mr);

thrust::transform(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
Expand All @@ -80,21 +84,15 @@ struct count_dispatch_fn {
} // namespace

/**
* @brief Returns a column of regex match counts for each string in the given column.
*
* A null entry will result in a zero count for that output row.
*
* @param d_strings Device view of the input strings column.
* @param d_prog Regex instance to evaluate on each string.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @copydoc cudf::strings::detail::count_matches
*/
std::unique_ptr<column> count_matches(column_device_view const& d_strings,
reprog_device const& d_prog,
size_type output_size,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
return regex_dispatcher(d_prog, count_dispatch_fn{d_prog}, d_strings, stream, mr);
return regex_dispatcher(d_prog, count_dispatch_fn{d_prog}, d_strings, output_size, stream, mr);
}

} // namespace detail
Expand Down
2 changes: 2 additions & 0 deletions cpp/src/strings/count_matches.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,14 @@ class reprog_device;
*
* @param d_strings Device view of the input strings column.
* @param d_prog Regex instance to evaluate on each string.
* @param output_size Number of rows for the output column.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned column's device memory.
*/
std::unique_ptr<column> count_matches(
column_device_view const& d_strings,
reprog_device const& d_prog,
size_type output_size,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/strings/extract/extract_all.cu
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ std::unique_ptr<column> extract_all_record(

// Get the match counts for each string.
// This column will become the output lists child offsets column.
auto offsets = count_matches(*d_strings, *d_prog, stream, mr);
auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr);
auto d_offsets = offsets->mutable_view().data<offset_type>();

// Compute null output rows
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/strings/search/findall.cu
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ std::unique_ptr<table> findall(strings_column_view const& input,
reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream);

auto const d_strings = column_device_view::create(input.parent(), stream);
auto find_counts = count_matches(*d_strings, *d_prog, stream);
auto find_counts = count_matches(*d_strings, *d_prog, strings_count + 1, stream);
auto d_find_counts = find_counts->view().data<size_type>();

size_type const columns_count = thrust::reduce(
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/strings/search/findall_record.cu
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ std::unique_ptr<column> findall_record(
reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream);

// Create lists offsets column
auto offsets = count_matches(*d_strings, *d_prog, stream, mr);
auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr);
auto d_offsets = offsets->mutable_view().data<offset_type>();

// Convert counts into offsets
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/strings/split/split_re.cu
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
auto d_strings = column_device_view::create(input.parent(), stream);

// count the number of delimiters matched in each string
auto offsets = count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource());
auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream);
auto offsets_view = offsets->mutable_view();
auto d_offsets = offsets_view.data<offset_type>();

Expand Down Expand Up @@ -287,7 +287,7 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
auto d_strings = column_device_view::create(input.parent(), stream);

// count the number of delimiters matched in each string
auto offsets = count_matches(*d_strings, *d_prog, stream, mr);
auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr);
auto offsets_view = offsets->mutable_view();

// get the split tokens from the input column; this also converts the counts into offsets
Expand Down
69 changes: 28 additions & 41 deletions cpp/src/table/row_operators.cu
Original file line number Diff line number Diff line change
Expand Up @@ -85,57 +85,44 @@ auto decompose_structs(table_view table,
auto const& col = table.column(col_idx);
if (is_nested(col.type())) {
// convert and insert
std::vector<column_view> r_verticalized_columns;
std::vector<int> r_verticalized_col_depths;
std::vector<column_view> flattened;
std::vector<int> depths;
// TODO: Here I added a bogus leaf column at the beginning to help in the while loop below.
// Refactor the while loop so that it can handle the last case.
flattened.push_back(make_empty_column(type_id::INT32)->view());
std::function<void(column_view const&, int)> recursive_child = [&](column_view const& c,
int depth) {
flattened.push_back(c);
depths.push_back(depth);
if (c.type().id() == type_id::STRUCT) {
for (int child_idx = 0; child_idx < c.num_children(); ++child_idx) {
auto scol = structs_column_view(c);
recursive_child(scol.get_sliced_child(child_idx), depth + 1);
std::vector<std::vector<column_view>> flattened;
std::function<void(column_view const&, std::vector<column_view>*, int)> recursive_child =
[&](column_view const& c, std::vector<column_view>* branch, int depth) {
branch->push_back(c);
if (c.type().id() == type_id::STRUCT) {
for (int child_idx = 0; child_idx < c.num_children(); ++child_idx) {
auto scol = structs_column_view(c);
if (child_idx > 0) {
verticalized_col_depths.push_back(depth + 1);
branch = &flattened.emplace_back();
}
recursive_child(scol.get_sliced_child(child_idx), branch, depth + 1);
}
}
}
};
recursive_child(col, 0);
int curr_col_idx = flattened.size() - 1;
column_view curr_col = flattened[curr_col_idx];
while (curr_col_idx > 0) {
auto const& prev_col = flattened[curr_col_idx - 1];
if (not is_nested(prev_col.type())) {
// We hit a column that's a leaf so seal this hierarchy
r_verticalized_columns.push_back(curr_col);
r_verticalized_col_depths.push_back(depths[curr_col_idx - 1]);
curr_col = prev_col;
} else {
curr_col = column_view(prev_col.type(),
prev_col.size(),
};
auto& branch = flattened.emplace_back();
verticalized_col_depths.push_back(0);
recursive_child(col, &branch, 0);

for (auto const& branch : flattened) {
column_view curr_col = branch.back();
for (auto it = branch.crbegin() + 1; it < branch.crend(); ++it) {
curr_col = column_view(it->type(),
it->size(),
nullptr,
prev_col.null_mask(),
it->null_mask(),
UNKNOWN_NULL_COUNT,
prev_col.offset(),
it->offset(),
{curr_col});
}
--curr_col_idx;
verticalized_columns.push_back(curr_col);
}
verticalized_columns.insert(
verticalized_columns.end(), r_verticalized_columns.rbegin(), r_verticalized_columns.rend());
verticalized_col_depths.insert(verticalized_col_depths.end(),
r_verticalized_col_depths.rbegin(),
r_verticalized_col_depths.rend());
if (not column_order.empty()) {
new_column_order.insert(
new_column_order.end(), r_verticalized_columns.size(), column_order[col_idx]);
new_column_order.insert(new_column_order.end(), flattened.size(), column_order[col_idx]);
}
if (not null_precedence.empty()) {
new_null_precedence.insert(
new_null_precedence.end(), r_verticalized_columns.size(), null_precedence[col_idx]);
new_null_precedence.end(), flattened.size(), null_precedence[col_idx]);
}
} else {
verticalized_columns.push_back(col);
Expand Down
Loading

0 comments on commit 3ee2f7b

Please sign in to comment.