Skip to content

Commit

Permalink
Merge branch 'branch-24.04' into feat/use_pylibcudf_groupby
Browse files Browse the repository at this point in the history
  • Loading branch information
vyasr authored Feb 3, 2024
2 parents 1ea0a8d + 6cebf22 commit 049787f
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 234 deletions.
80 changes: 0 additions & 80 deletions cpp/include/cudf/column/column_factories.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,63 +410,6 @@ std::unique_ptr<column> make_strings_column(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Construct a STRING type column given a device span of chars encoded as UTF-8, a device
* span of byte offsets identifying individual strings within the chars span, and an optional
* null bitmask.
*
* @deprecated Since 24.02
*
* The total number of char bytes must not exceed the maximum size of size_type. Use the
* strings_column_view class to perform strings operations on this type of column.
*
* This function makes a deep copy of the strings, offsets, null_mask to create a new column.
*
* @param strings The device span of chars in device memory. These bytes are expected to be
* UTF-8 encoded characters.
* @param offsets The device span of byte offsets in device memory. The number of elements is
* one more than the total number of strings so the `offsets.back()` is the total number of bytes
* in the strings array. `offsets.front()` must always be 0 to point to the beginning of `strings`.
* @param null_mask Device span containing the null element indicator bitmask. Arrow format for
* nulls is used for interpreting this bitmask.
* @param null_count The number of null string entries
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used for allocation of the column's `null_mask` and children
* columns' device memory
* @return Constructed strings column
*/
[[deprecated]] std::unique_ptr<column> make_strings_column(
cudf::device_span<char const> strings,
cudf::device_span<size_type const> offsets,
cudf::device_span<bitmask_type const> null_mask,
size_type null_count,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Construct a STRING type column given an offsets column, a chars column, a null count
* and a null mask.
*
* @deprecated This overload is marked `[[deprecated]]`.
* NOTE(review): the release in which it was deprecated is not recorded here - confirm and add it.
*
* The columns and mask are moved into the resulting strings column.
*
* @param num_strings The number of strings the column represents.
* @param offsets_column The column of offset values for this column. The number of elements is
* one more than the total number of strings so the `offset[last] - offset[0]` is the total number
* of bytes in the strings vector.
* @param chars_column The column of char bytes for all the strings for this column. Individual
* strings are identified by the offsets and the nullmask.
* @param null_count The number of null string entries.
* @param null_mask The bits specifying the null strings in device memory. Arrow format for
* nulls is used for interpreting this bitmask.
* @return Constructed strings column
*/
[[deprecated]] std::unique_ptr<column> make_strings_column(size_type num_strings,
std::unique_ptr<column> offsets_column,
std::unique_ptr<column> chars_column,
size_type null_count,
rmm::device_buffer&& null_mask);
/**
* @brief Construct a STRING type column given offsets column, chars columns, and null mask and null
* count.
Expand All @@ -490,29 +433,6 @@ std::unique_ptr<column> make_strings_column(size_type num_strings,
size_type null_count,
rmm::device_buffer&& null_mask);

/**
* @brief Construct a STRING type column given offsets, chars, a null mask and a null count.
*
* @deprecated Since 24.02
*
* The offsets, chars and null mask are all taken by rvalue reference, and neither `null_mask`
* nor `null_count` has a default value, so every argument must be supplied by the caller.
*
* @param[in] num_strings The number of strings the column represents.
* @param[in] offsets The offset values for this column. The number of elements is one more than the
* total number of strings so the `offset[last] - offset[0]` is the total number of bytes in the
* strings vector.
* @param[in] chars The char bytes for all the strings for this column. Individual strings are
* identified by the offsets and the nullmask.
* @param[in] null_mask The bits specifying the null strings in device memory. Arrow format for
* nulls is used for interpreting this bitmask.
* @param[in] null_count The number of null string entries.
* @return Constructed strings column
*/
[[deprecated]] std::unique_ptr<column> make_strings_column(size_type num_strings,
rmm::device_uvector<size_type>&& offsets,
rmm::device_uvector<char>&& chars,
rmm::device_buffer&& null_mask,
size_type null_count);

/**
* @brief Construct a LIST type column given offsets column, child column, null mask and null
* count.
Expand Down
10 changes: 0 additions & 10 deletions cpp/include/cudf/strings/strings_column_view.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -103,16 +103,6 @@ class strings_column_view : private column_view {
*/
[[nodiscard]] offset_iterator offsets_end() const;

/**
* @brief Returns the internal column of chars
*
* @deprecated This accessor is marked `[[deprecated]]`.
* NOTE(review): the release in which it was deprecated is not recorded here - confirm and add it.
*
* @throw cudf::logic_error if this strings column has no children
* @param stream CUDA stream used for device memory operations and kernel launches
* @return The chars column
*/
[[deprecated]] [[nodiscard]] column_view chars(
rmm::cuda_stream_view stream = cudf::get_default_stream()) const;

/**
* @brief Returns the number of bytes in the chars child column.
*
Expand Down
39 changes: 11 additions & 28 deletions cpp/src/strings/convert/convert_urls.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/strings/convert/convert_urls.hpp>
Expand All @@ -34,10 +34,6 @@

#include <cub/cub.cuh>

#include <thrust/scan.h>

#include <algorithm>

namespace cudf {
namespace strings {
namespace detail {
Expand Down Expand Up @@ -282,7 +278,7 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings,
template <size_type num_warps_per_threadblock, size_type char_block_size>
CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings,
char* const out_chars,
size_type const* const out_offsets)
cudf::detail::input_offsetalator const out_offsets)
{
constexpr int halo_size = 2;
__shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size * 2];
Expand Down Expand Up @@ -384,38 +380,25 @@ std::unique_ptr<column> url_decode(strings_column_view const& strings,
auto const num_threadblocks =
std::min(65536, cudf::util::div_rounding_up_unsafe(strings_count, num_warps_per_threadblock));

auto offset_count = strings_count + 1;
auto const d_strings = column_device_view::create(strings.parent(), stream);

// build offsets column
auto offsets_column = make_numeric_column(
data_type{type_to_id<size_type>()}, offset_count, mask_state::UNALLOCATED, stream, mr);

// count number of bytes in each string after decoding and store it in offsets_column
auto offsets_view = offsets_column->view();
auto offsets_mutable_view = offsets_column->mutable_view();
// build offsets column by computing the output row sizes and scanning the results
auto row_sizes = rmm::device_uvector<size_type>(strings_count, stream);
url_decode_char_counter<num_warps_per_threadblock, char_block_size>
<<<num_threadblocks, threadblock_size, 0, stream.value()>>>(
*d_strings, offsets_mutable_view.begin<size_type>());

// use scan to transform number of bytes into offsets
thrust::exclusive_scan(rmm::exec_policy(stream),
offsets_view.begin<size_type>(),
offsets_view.end<size_type>(),
offsets_mutable_view.begin<size_type>());

// copy the total number of characters of all strings combined (last element of the offset column)
// to the host memory
auto out_chars_bytes = cudf::detail::get_value<size_type>(offsets_view, offset_count - 1, stream);
<<<num_threadblocks, threadblock_size, 0, stream.value()>>>(*d_strings, row_sizes.data());
// performs scan on the sizes and builds the appropriate offsets column
auto [offsets_column, out_chars_bytes] = cudf::strings::detail::make_offsets_child_column(
row_sizes.begin(), row_sizes.end(), stream, mr);

// create the chars column
rmm::device_uvector<char> chars(out_chars_bytes, stream, mr);
auto d_out_chars = chars.data();
auto const offsets =
cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());

// decode and copy the characters from the input column to the output column
url_decode_char_replacer<num_warps_per_threadblock, char_block_size>
<<<num_threadblocks, threadblock_size, 0, stream.value()>>>(
*d_strings, d_out_chars, offsets_column->view().begin<size_type>());
<<<num_threadblocks, threadblock_size, 0, stream.value()>>>(*d_strings, d_out_chars, offsets);

// copy null mask
rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
Expand Down
70 changes: 0 additions & 70 deletions cpp/src/strings/strings_column_factories.cu
Original file line number Diff line number Diff line change
Expand Up @@ -56,25 +56,6 @@ std::unique_ptr<column> make_strings_column(
return cudf::strings::detail::make_strings_column(strings.begin(), strings.end(), stream, mr);
}

// Create a strings-type column from a device span of chars and a device span of offsets,
// taking ownership of the supplied null mask. All of the work is delegated to the
// iterator-based implementation in cudf::strings::detail.
std::unique_ptr<column> make_strings_column(device_span<char> chars,
                                            device_span<size_type> offsets,
                                            size_type null_count,
                                            rmm::device_buffer&& null_mask,
                                            rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();

  // Name the iterator ranges up front so the forwarding call below reads as a flat argument list.
  auto chars_first   = chars.begin();
  auto chars_last    = chars.end();
  auto offsets_first = offsets.begin();
  auto offsets_last  = offsets.end();

  return cudf::strings::detail::make_strings_column(
    chars_first, chars_last, offsets_first, offsets_last, null_count, std::move(null_mask), stream, mr);
}

std::unique_ptr<column> make_strings_column(device_span<string_view const> string_views,
string_view null_placeholder,
rmm::cuda_stream_view stream,
Expand All @@ -88,57 +69,6 @@ std::unique_ptr<column> make_strings_column(device_span<string_view const> strin
it_pair, it_pair + string_views.size(), stream, mr);
}

// Build a strings-type column from read-only device spans of chars, offsets and validity bits.
// The validity bits are copied into a freshly allocated buffer before being handed to the
// iterator-based implementation in cudf::strings::detail.
std::unique_ptr<column> make_strings_column(cudf::device_span<char const> strings,
                                            cudf::device_span<size_type const> offsets,
                                            cudf::device_span<bitmask_type const> valid_mask,
                                            size_type null_count,
                                            rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();

  // The span counts bitmask words; the buffer constructor wants a size in bytes.
  auto const mask_bytes = valid_mask.size() * sizeof(bitmask_type);
  rmm::device_buffer null_mask{valid_mask.data(), mask_bytes, stream, mr};

  return cudf::strings::detail::make_strings_column(
    strings.begin(), strings.end(), offsets.begin(), offsets.end(), null_count, std::move(null_mask), stream, mr);
}

/**
 * @brief Create a strings-type column from an offsets column and a chars column.
 *
 * The offsets column becomes the only child of the result, the data buffer of the chars column
 * becomes the data buffer of the result, and the null mask is moved into the result.
 *
 * @throw cudf::logic_error if `null_count > 0` but `null_mask` is empty, if the offsets column
 * does not hold exactly `num_strings + 1` elements, or if either input column contains nulls.
 *
 * @param num_strings The number of strings the column represents; 0 yields an empty STRING column
 * @param offsets_column The column of offset values, moved into the result as a child
 * @param chars_column The column whose data buffer holds the char bytes
 * @param null_count The number of null string entries
 * @param null_mask The bits specifying the null strings, moved into the result
 * @return Constructed strings column
 */
std::unique_ptr<column> make_strings_column(size_type num_strings,
                                            std::unique_ptr<column> offsets_column,
                                            std::unique_ptr<column> chars_column,
                                            size_type null_count,
                                            rmm::device_buffer&& null_mask)
{
  CUDF_FUNC_RANGE();

  if (num_strings == 0) { return make_empty_column(type_id::STRING); }

  if (null_count > 0) { CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); }
  CUDF_EXPECTS(num_strings == offsets_column->size() - 1,
               "Invalid offsets column size for strings column.");
  CUDF_EXPECTS(offsets_column->null_count() == 0, "Offsets column should not contain nulls");
  CUDF_EXPECTS(chars_column->null_count() == 0, "Chars column should not contain nulls");

  // Keep the released contents in a local so the unique_ptr that owns the chars buffer object
  // still destroys it once its device allocation has been moved out. Calling `.data.release()`
  // on the temporary instead would orphan that heap-allocated buffer object on every call.
  auto chars_contents = chars_column->release();

  std::vector<std::unique_ptr<column>> children;
  children.emplace_back(std::move(offsets_column));
  return std::make_unique<column>(data_type{type_id::STRING},
                                  num_strings,
                                  std::move(*chars_contents.data),
                                  std::move(null_mask),
                                  null_count,
                                  std::move(children));
}

std::unique_ptr<column> make_strings_column(size_type num_strings,
std::unique_ptr<column> offsets_column,
rmm::device_buffer&& chars_buffer,
Expand Down
7 changes: 0 additions & 7 deletions cpp/src/strings/strings_column_view.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,6 @@ strings_column_view::offset_iterator strings_column_view::offsets_end() const
return offsets_begin() + size() + 1;
}

// Returns a non-owning INT8 view over the char bytes of this strings column.
// Fails the CUDF_EXPECTS check when the column has no children.
column_view strings_column_view::chars(rmm::cuda_stream_view stream) const
{
  CUDF_EXPECTS(num_children() > 0, "strings column has no children");

  auto const chars_type = data_type{type_id::INT8};
  auto const byte_count = chars_size(stream);
  auto const first_byte = chars_begin(stream);

  // No null mask, zero nulls, zero offset.
  return column_view(chars_type, byte_count, first_byte, nullptr, 0, 0);
}

size_type strings_column_view::chars_size(rmm::cuda_stream_view stream) const noexcept
{
if (size() == 0) return 0;
Expand Down
Loading

0 comments on commit 049787f

Please sign in to comment.