Skip to content

Commit

Permalink
Merge pull request #4922 from davidwendt/perf-strings-split-to-many-c…
Browse files Browse the repository at this point in the history
…olumns

[REVIEW] Fix cudf::strings:split logic for many columns
  • Loading branch information
harrism authored Apr 27, 2020
2 parents 20bde03 + 5bc4ede commit 4063f4c
Show file tree
Hide file tree
Showing 9 changed files with 1,503 additions and 804 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@
- PR #4899 Fix series inplace handling
- PR #4940 Fix boolean mask issue with large sized Dataframe
- PR #4889 Fix multi-index merging
- PR #4922 Fix cudf::strings:split logic for many columns
- PR #4949 Fix scatter, gather benchmark constructor call
- PR #4965 Raise Error when there are duplicate columns sent to `cudf.concat`
- PR #4996 Parquet writer: fix potentially zero-sized string dictionary
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,7 @@ add_library(cudf
src/strings/sorting/sorting.cu
src/strings/split/partition.cu
src/strings/split/split.cu
src/strings/split/split_record.cu
src/strings/strings_column_factories.cu
src/strings/strings_column_view.cu
src/strings/strings_scalar_factories.cpp
Expand Down
104 changes: 104 additions & 0 deletions cpp/include/cudf/strings/detail/strings_column_factories.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/valid_if.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/utilities/error.hpp>
#include <strings/utilities.cuh>

#include <rmm/thrust_rmm_allocator.h>
#include <thrust/for_each.h>
#include <thrust/transform_reduce.h>

// clang-format off
namespace cudf {
namespace strings {
namespace detail {

// Create a strings-type column from vector of pointer/size pairs
template<typename IndexPairIterator>
std::unique_ptr<column> make_strings_column(
IndexPairIterator begin, IndexPairIterator end,
rmm::mr::device_memory_resource* mr,
cudaStream_t stream )
{
CUDF_FUNC_RANGE();
size_type strings_count = thrust::distance(begin,end);
if (strings_count == 0) return strings::detail::make_empty_strings_column(mr, stream);

using string_index_pair = thrust::pair<const char*, size_type>;

auto execpol = rmm::exec_policy(stream);
// check total size is not too large for cudf column
size_t bytes = thrust::transform_reduce(
execpol->on(stream), begin, end,
[] __device__(string_index_pair const& item) {
return (item.first != nullptr) ? item.second : 0;
},
0,
thrust::plus<size_t>());
CUDF_EXPECTS(bytes < std::numeric_limits<size_type>::max(),
"total size of strings is too large for cudf column");

// build offsets column from the strings sizes
auto offsets_transformer = [begin] __device__(size_type idx) {
string_index_pair const item = begin[idx];
return (item.first != nullptr ? static_cast<int32_t>(item.second) : 0);
};
auto offsets_transformer_itr = thrust::make_transform_iterator(
thrust::make_counting_iterator<size_type>(0), offsets_transformer);
auto offsets_column = strings::detail::make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream);
auto d_offsets = offsets_column->view().template data<int32_t>();

// create null mask
auto new_nulls = experimental::detail::valid_if( begin, end,
[] __device__(string_index_pair const item) { return item.first != nullptr; },
stream,
mr);
auto null_count = new_nulls.second;
rmm::device_buffer null_mask;
if (null_count > 0) null_mask = std::move(new_nulls.first);

// build chars column
auto chars_column =
strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream);
auto d_chars = chars_column->mutable_view().template data<char>();
thrust::for_each_n(execpol->on(stream),
thrust::make_counting_iterator<size_type>(0),
strings_count,
[begin, d_offsets, d_chars] __device__(size_type idx) {
string_index_pair const item = begin[idx];
if (item.first != nullptr)
memcpy(d_chars + d_offsets[idx], item.first, item.second);
});

return make_strings_column(strings_count,
std::move(offsets_column),
std::move(chars_column),
null_count,
std::move(null_mask),
stream,
mr);
}

} // namespace detail
} // namespace strings
} // namespace cudf
// clang-format on TODO fix
8 changes: 4 additions & 4 deletions cpp/include/cudf/strings/split/split.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ namespace strings {
*
* Any null string entries return corresponding null output columns.
*
* @param strings Strings instance for this operation.
* @param strings_column Strings instance for this operation.
* @param delimiter UTF-8 encoded string indentifying the split points in each string.
* Default of empty string indicates split on whitespace.
* @param maxsplit Maximum number of splits to perform.
Expand All @@ -44,7 +44,7 @@ namespace strings {
* @return New table of strings columns.
*/
std::unique_ptr<experimental::table> split(
strings_column_view const& strings,
strings_column_view const& strings_column,
string_scalar const& delimiter = string_scalar(""),
size_type maxsplit = -1,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
Expand All @@ -63,7 +63,7 @@ std::unique_ptr<experimental::table> split(
*
* Any null string entries return corresponding null output columns.
*
* @param strings Strings instance for this operation.
* @param strings_column Strings instance for this operation.
* @param delimiter UTF-8 encoded string indentifying the split points in each string.
* Default of empty string indicates split on whitespace.
* @param maxsplit Maximum number of splits to perform.
Expand All @@ -72,7 +72,7 @@ std::unique_ptr<experimental::table> split(
* @return New strings columns.
*/
std::unique_ptr<experimental::table> rsplit(
strings_column_view const& strings,
strings_column_view const& strings_column,
string_scalar const& delimiter = string_scalar(""),
size_type maxsplit = -1,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/rolling/rolling.cu
Original file line number Diff line number Diff line change
Expand Up @@ -446,8 +446,7 @@ struct rolling_window_launcher {
// and that's why nullify_out_of_bounds/ignore_out_of_bounds is true.
auto output_table =
detail::gather(table_view{{input}}, output->view(), false, true, false, mr, stream);
return std::make_unique<cudf::column>(std::move(output_table->get_column(0)));
;
output = std::make_unique<cudf::column>(std::move(output_table->get_column(0)));
}

return output;
Expand Down
Loading

0 comments on commit 4063f4c

Please sign in to comment.