Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-22.12' into dataframe-p…
Browse files Browse the repository at this point in the history
…ivot_table
  • Loading branch information
bdice committed Oct 27, 2022
2 parents 05a748a + bac2004 commit fc2f3e7
Show file tree
Hide file tree
Showing 13 changed files with 641 additions and 251 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,5 @@ dask-worker-space/
# Sphinx docs & build artifacts
docs/cudf/source/api_docs/generated/*
docs/cudf/source/api_docs/api/*
docs/cudf/source/user_guide/example_output/*
docs/cudf/source/user_guide/cudf.*Dtype.*.rst
5 changes: 4 additions & 1 deletion cpp/benchmarks/io/text/multibyte_split.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ static void bench_multibyte_split(nvbench::state& state,
auto const delim_percent = state.get_int64("delim_percent");
auto const file_size_approx = state.get_int64("size_approx");
auto const byte_range_percent = state.get_int64("byte_range_percent");
auto const strip_delimiters = bool(state.get_int64("strip_delimiters"));

auto const byte_range_factor = static_cast<double>(byte_range_percent) / 100;
CUDF_EXPECTS(delim_percent >= 1, "delimiter percent must be at least 1");
Expand Down Expand Up @@ -182,12 +183,13 @@ static void bench_multibyte_split(nvbench::state& state,
auto const range_size = static_cast<int64_t>(device_input.size() * byte_range_factor);
auto const range_offset = (device_input.size() - range_size) / 2;
cudf::io::text::byte_range_info range{range_offset, range_size};
cudf::io::text::parse_options options{range, strip_delimiters};
std::unique_ptr<cudf::column> output;

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
try_drop_l3_cache();
output = cudf::io::text::multibyte_split(*source, delim, range);
output = cudf::io::text::multibyte_split(*source, delim, options);
});

state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "pmu", "Peak Memory Usage");
Expand All @@ -203,6 +205,7 @@ using source_type_list = nvbench::enum_type_list<data_chunk_source_type::device,

NVBENCH_BENCH_TYPES(bench_multibyte_split, NVBENCH_TYPE_AXES(source_type_list))
.set_name("multibyte_split")
.add_int64_axis("strip_delimiters", {0, 1})
.add_int64_axis("delim_size", {1, 4, 7})
.add_int64_axis("delim_percent", {1, 25})
.add_int64_power_of_two_axis("size_approx", {15, 30})
Expand Down
30 changes: 25 additions & 5 deletions cpp/include/cudf/io/text/multibyte_split.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,25 @@ namespace cudf {
namespace io {
namespace text {

/**
* @brief Parsing options for multibyte_split.
*
* Bundles the byte range and the delimiter-stripping flag so they can be passed to
* multibyte_split as a single argument. Callers may brace-initialize this struct positionally
* (byte range first, then the strip flag), so the member order is part of its interface.
*/
struct parse_options {
/**
* @brief Only rows starting inside this byte range will be part of the output column.
*
* Defaults to the value returned by create_byte_range_info_max().
*/
byte_range_info byte_range = create_byte_range_info_max();
/**
* @brief Whether delimiters at the end of rows should be stripped from the output column.
*
* Defaults to false, i.e. delimiters are not stripped.
*/
bool strip_delimiters = false;
};

/**
* @brief Splits the source text into a strings column using a multiple byte delimiter.
*
* Providing a byte range allows multibyte_split to read a whole file, but only return the offsets
* of delimiters which begin within the range. If thinking in terms of "records", where each
* Providing a byte range allows multibyte_split to read a file partially, only returning the
* offsets of delimiters which begin within the range. If thinking in terms of "records", where each
* delimiter dictates the end of a record, all records which begin within the byte range provided
* will be returned, including any record which may begin in the range but end outside of the
* range. Records which begin outside of the range will be ignored, even if those records end inside
Expand Down Expand Up @@ -63,16 +77,22 @@ namespace text {
*
* @param source The source string
* @param delimiter UTF-8 encoded string for which to find offsets in the source
* @param byte_range range in which to consider offsets relevant
* @param options the parsing options to use (including byte range)
* @param mr Memory resource to use for the device memory allocation
* @return The strings found by splitting the source by the delimiter within the relevant byte
* range.
*/
std::unique_ptr<cudf::column> multibyte_split(
data_chunk_source const& source,
std::string const& delimiter,
std::optional<byte_range_info> byte_range = std::nullopt,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
parse_options options = {},
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Overload of multibyte_split that takes an optional byte range instead of parse_options.
*
* NOTE(review): presumably kept so that existing callers passing a byte range directly continue
* to compile; how an empty optional is interpreted is not visible here — confirm against the
* definition.
*
* @param source The source string
* @param delimiter UTF-8 encoded string for which to find offsets in the source
* @param byte_range range in which to consider offsets relevant
* @param mr Memory resource to use for the device memory allocation
* @return NOTE(review): presumably the strings found by splitting the source by the delimiter
* within the relevant byte range — confirm against the definition.
*/
std::unique_ptr<cudf::column> multibyte_split(
data_chunk_source const& source,
std::string const& delimiter,
std::optional<byte_range_info> byte_range,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

std::unique_ptr<cudf::column> multibyte_split(data_chunk_source const& source,
std::string const& delimiter,
Expand Down
Loading

0 comments on commit fc2f3e7

Please sign in to comment.