Skip to content

Commit

Permalink
Merge branch 'branch-24.04' into json-whitespace
Browse files Browse the repository at this point in the history
  • Loading branch information
shrshi authored Mar 4, 2024
2 parents d4b1b26 + dbdcc31 commit ff91891
Show file tree
Hide file tree
Showing 59 changed files with 620 additions and 289 deletions.
3 changes: 1 addition & 2 deletions cpp/benchmarks/json/json.cu
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,7 @@ auto build_json_string_column(int desired_bytes, int num_rows)
desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order};
auto [offsets, chars] = cudf::strings::detail::make_strings_children(
jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
return cudf::make_strings_column(
num_rows, std::move(offsets), std::move(chars->release().data.release()[0]), 0, {});
return cudf::make_strings_column(num_rows, std::move(offsets), chars.release(), 0, {});
}

void BM_case(benchmark::State& state, std::string query_arg)
Expand Down
11 changes: 5 additions & 6 deletions cpp/include/cudf/strings/detail/strings_children.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ namespace strings {
namespace detail {

/**
* @brief Creates child offsets and chars columns by applying the template function that
* @brief Creates child offsets and chars data by applying the template function that
* can be used for computing the output size of each string as well as create the output
*
* @throws std::overflow_error if the output strings column exceeds the column size limit
Expand All @@ -49,7 +49,7 @@ namespace detail {
* @param strings_count Number of strings.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device memory resource used to allocate the returned columns' device memory.
* @return offsets child column and chars child column for a strings column
* @return Offsets child column and chars data for a strings column
*/
template <typename SizeAndExecuteFunction>
auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
Expand Down Expand Up @@ -84,18 +84,17 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
std::overflow_error);

// Now build the chars column
std::unique_ptr<column> chars_column =
create_chars_child_column(static_cast<size_type>(bytes), stream, mr);
rmm::device_uvector<char> chars(bytes, stream, mr);

// Execute the function fn again to fill the chars column.
// Note that if the output chars column has zero size, the function fn should not be called to
// avoid accidentally overwriting the offsets.
if (bytes > 0) {
size_and_exec_fn.d_chars = chars_column->mutable_view().template data<char>();
size_and_exec_fn.d_chars = chars.data();
for_each_fn(size_and_exec_fn);
}

return std::pair(std::move(offsets_column), std::move(chars_column));
return std::pair(std::move(offsets_column), std::move(chars));
}

/**
Expand Down
13 changes: 6 additions & 7 deletions cpp/include/cudf/table/row_operators.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -20,19 +20,16 @@
#include <cudf/detail/utilities/assert.cuh>
#include <cudf/hashing/detail/hash_functions.cuh>
#include <cudf/hashing/detail/hashing.hpp>
#include <cudf/sorting.hpp>
#include <cudf/table/table_device_view.cuh>
#include <cudf/utilities/traits.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

#include <cuda/std/limits>
#include <thrust/equal.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/swap.h>
#include <thrust/transform_reduce.h>

#include <limits>

namespace cudf {

/**
Expand Down Expand Up @@ -470,7 +467,9 @@ class element_hasher {
template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
__device__ hash_value_type operator()(column_device_view col, size_type row_index) const
{
if (has_nulls && col.is_null(row_index)) { return std::numeric_limits<hash_value_type>::max(); }
if (has_nulls && col.is_null(row_index)) {
return cuda::std::numeric_limits<hash_value_type>::max();
}
return hash_function<T>{}(col.element<T>(row_index));
}

Expand Down Expand Up @@ -554,7 +553,7 @@ class element_hasher_with_seed {

private:
uint32_t _seed{DEFAULT_HASH_SEED};
hash_value_type _null_hash{std::numeric_limits<hash_value_type>::max()};
hash_value_type _null_hash{cuda::std::numeric_limits<hash_value_type>::max()};
Nullate _has_nulls;
};

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/csv/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -180,12 +180,12 @@ struct column_to_strings_fn {

auto d_column = column_device_view::create(column_v, stream_);
escape_strings_fn fn{*d_column, delimiter.value(stream_)};
auto [offsets_column, chars_column] =
auto [offsets_column, chars] =
cudf::strings::detail::make_strings_children(fn, column_v.size(), stream_, mr_);

return make_strings_column(column_v.size(),
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
chars.release(),
column_v.null_count(),
cudf::detail::copy_bitmask(column_v, stream_, mr_));
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/io/json/write_json.cu
Original file line number Diff line number Diff line change
Expand Up @@ -169,12 +169,12 @@ struct escape_strings_fn {
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto [offsets_column, chars_column] =
auto [offsets_column, chars] =
cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr);

return make_strings_column(column_v.size(),
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
chars.release(),
column_v.null_count(),
cudf::detail::copy_bitmask(column_v, stream, mr));
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/lists/interleave_columns.cu
Original file line number Diff line number Diff line change
Expand Up @@ -228,15 +228,15 @@ struct interleave_list_entries_impl<T, std::enable_if_t<std::is_same_v<T, cudf::
rmm::device_uvector<int8_t>(data_has_null_mask ? num_output_entries : 0, stream);
comp_fn.d_validities = validities.data();

auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children(
auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
comp_fn, num_output_lists, num_output_entries, stream, mr);

auto [null_mask, null_count] =
cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr);

return make_strings_column(num_output_entries,
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
chars.release(),
null_count,
std::move(null_mask));
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/replace/clamp.cu
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,12 @@ std::unique_ptr<cudf::column> clamp_string_column(strings_column_view const& inp

auto fn = clamp_strings_fn<OptionalScalarIterator, ReplaceScalarIterator>{
d_input, lo_itr, lo_replace_itr, hi_itr, hi_replace_itr};
auto [offsets_column, chars_column] =
auto [offsets_column, chars] =
cudf::strings::detail::make_strings_children(fn, input.size(), stream, mr);

return make_strings_column(input.size(),
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
chars.release(),
input.null_count(),
std::move(cudf::detail::copy_bitmask(input.parent(), stream, mr)));
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/strings/capitalize.cu
Original file line number Diff line number Diff line change
Expand Up @@ -229,12 +229,12 @@ std::unique_ptr<column> capitalizer(CapitalFn cfn,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto [offsets_column, chars_column] =
auto [offsets_column, chars] =
cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr);

return make_strings_column(input.size(),
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
chars.release(),
input.null_count(),
cudf::detail::copy_bitmask(input.parent(), stream, mr));
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/strings/case.cu
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr);
return make_strings_column(input.size(),
std::move(offsets),
std::move(chars->release().data.release()[0]),
chars.release(),
input.null_count(),
cudf::detail::copy_bitmask(input.parent(), stream, mr));
}
Expand Down
4 changes: 2 additions & 2 deletions cpp/src/strings/char_types/char_types.cu
Original file line number Diff line number Diff line change
Expand Up @@ -200,13 +200,13 @@ std::unique_ptr<column> filter_characters_of_type(strings_column_view const& str
rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);

// this utility calls filterer to build the offsets and chars columns
auto [offsets_column, chars_column] =
auto [offsets_column, chars] =
cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr);

// return new strings column
return make_strings_column(strings_count,
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
chars.release(),
strings.null_count(),
std::move(null_mask));
}
Expand Down
18 changes: 6 additions & 12 deletions cpp/src/strings/combine/concatenate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
// Create device views from the strings columns.
auto d_table = table_device_view::create(strings_columns, stream);
concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls};
auto [offsets_column, chars_column] = make_strings_children(fn, strings_count, stream, mr);
auto [offsets_column, chars] = make_strings_children(fn, strings_count, stream, mr);

// create resulting null mask
auto [null_mask, null_count] = cudf::detail::valid_if(
Expand All @@ -156,11 +156,8 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
stream,
mr);

return make_strings_column(strings_count,
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
null_count,
std::move(null_mask));
return make_strings_column(
strings_count, std::move(offsets_column), chars.release(), null_count, std::move(null_mask));
}

namespace {
Expand Down Expand Up @@ -237,7 +234,7 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,

multi_separator_concat_fn mscf{
*d_table, separator_col_view, separator_rep, col_rep, separate_nulls};
auto [offsets_column, chars_column] = make_strings_children(mscf, strings_count, stream, mr);
auto [offsets_column, chars] = make_strings_children(mscf, strings_count, stream, mr);

// Create resulting null mask
auto [null_mask, null_count] = cudf::detail::valid_if(
Expand All @@ -252,11 +249,8 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
stream,
mr);

return make_strings_column(strings_count,
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
null_count,
std::move(null_mask));
return make_strings_column(
strings_count, std::move(offsets_column), chars.release(), null_count, std::move(null_mask));
}

} // namespace detail
Expand Down
35 changes: 19 additions & 16 deletions cpp/src/strings/combine/join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -142,28 +142,34 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,

auto d_strings = column_device_view::create(input.parent(), stream);

auto chars_column = [&] {
auto chars = [&] {
// build the strings column and commandeer the chars column
if ((input.size() == input.null_count()) ||
((input.chars_size(stream) / (input.size() - input.null_count())) <=
AVG_CHAR_BYTES_THRESHOLD)) {
return std::get<1>(
make_strings_children(join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr));
return std::get<1>(make_strings_children(
join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr))
.release();
}
// dynamically feeds index pairs to build the output
auto indices = cudf::detail::make_counting_transform_iterator(
0, join_gather_fn{*d_strings, d_separator, d_narep});
auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr);
auto chars_data = joined_col->release().data;
auto const chars_size = chars_data->size();
return std::make_unique<cudf::column>(
data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0);
auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr);
auto chars_data = joined_col->release().data;
return std::move(*chars_data);
}();

// build the offsets: single string output has offsets [0,chars-size]
auto offsets = cudf::detail::make_device_uvector_async(
std::vector<size_type>({0, chars_column->size()}), stream, mr);
auto offsets_column = std::make_unique<column>(std::move(offsets), rmm::device_buffer{}, 0);
auto offsets_column = [&] {
if (chars.size() < static_cast<std::size_t>(get_offset64_threshold())) {
auto offsets32 = cudf::detail::make_device_uvector_async(
std::vector<int32_t>({0, static_cast<int32_t>(chars.size())}), stream, mr);
return std::make_unique<column>(std::move(offsets32), rmm::device_buffer{}, 0);
}
auto offsets64 = cudf::detail::make_device_uvector_async(
std::vector<int64_t>({0L, static_cast<int64_t>(chars.size())}), stream, mr);
return std::make_unique<column>(std::move(offsets64), rmm::device_buffer{}, 0);
}();

// build the null mask: only one output row so it is either all-valid or all-null
auto const null_count =
Expand All @@ -173,11 +179,8 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,
: rmm::device_buffer{0, stream, mr};

// perhaps this return a string_scalar instead of a single-row column
return make_strings_column(1,
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
null_count,
std::move(null_mask));
return make_strings_column(
1, std::move(offsets_column), std::move(chars), null_count, std::move(null_mask));
}

} // namespace detail
Expand Down
18 changes: 6 additions & 12 deletions cpp/src/strings/combine/join_list_elements.cu
Original file line number Diff line number Diff line change
Expand Up @@ -207,19 +207,16 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
separate_nulls,
empty_list_policy};

auto [offsets_column, chars_column] = make_strings_children(comp_fn, num_rows, stream, mr);
auto [offsets_column, chars] = make_strings_children(comp_fn, num_rows, stream, mr);
auto [null_mask, null_count] =
cudf::detail::valid_if(thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(num_rows),
validities_fn{comp_fn},
stream,
mr);

return make_strings_column(num_rows,
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
null_count,
std::move(null_mask));
return make_strings_column(
num_rows, std::move(offsets_column), chars.release(), null_count, std::move(null_mask));
}

namespace {
Expand Down Expand Up @@ -285,19 +282,16 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
separate_nulls,
empty_list_policy};

auto [offsets_column, chars_column] = make_strings_children(comp_fn, num_rows, stream, mr);
auto [offsets_column, chars] = make_strings_children(comp_fn, num_rows, stream, mr);
auto [null_mask, null_count] =
cudf::detail::valid_if(thrust::counting_iterator<size_type>(0),
thrust::counting_iterator<size_type>(num_rows),
validities_fn{comp_fn},
stream,
mr);

return make_strings_column(num_rows,
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
null_count,
std::move(null_mask));
return make_strings_column(
num_rows, std::move(offsets_column), chars.release(), null_count, std::move(null_mask));
}

} // namespace detail
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/strings/convert/convert_booleans.cu
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ std::unique_ptr<column> from_booleans(column_view const& booleans,

return make_strings_column(strings_count,
std::move(offsets),
std::move(chars->release().data.release()[0]),
chars.release(),
booleans.null_count(),
std::move(null_mask));
}
Expand Down
18 changes: 9 additions & 9 deletions cpp/src/strings/convert/convert_datetime.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1099,7 +1099,7 @@ struct datetime_formatter_fn {
};

//
using strings_children = std::pair<std::unique_ptr<cudf::column>, std::unique_ptr<cudf::column>>;
using strings_children = std::pair<std::unique_ptr<cudf::column>, rmm::device_uvector<char>>;
struct dispatch_from_timestamps_fn {
template <typename T, std::enable_if_t<cudf::is_timestamp<T>()>* = nullptr>
strings_children operator()(column_device_view const& d_timestamps,
Expand Down Expand Up @@ -1148,17 +1148,17 @@ std::unique_ptr<column> from_timestamps(column_view const& timestamps,
auto const d_timestamps = column_device_view::create(timestamps, stream);

// dispatcher is called to handle the different timestamp types
auto [offsets_column, chars_column] = cudf::type_dispatcher(timestamps.type(),
dispatch_from_timestamps_fn(),
*d_timestamps,
*d_names,
d_format_items,
stream,
mr);
auto [offsets_column, chars] = cudf::type_dispatcher(timestamps.type(),
dispatch_from_timestamps_fn(),
*d_timestamps,
*d_names,
d_format_items,
stream,
mr);

return make_strings_column(timestamps.size(),
std::move(offsets_column),
std::move(chars_column->release().data.release()[0]),
chars.release(),
timestamps.null_count(),
cudf::detail::copy_bitmask(timestamps, stream, mr));
}
Expand Down
Loading

0 comments on commit ff91891

Please sign in to comment.