Skip to content

Commit

Permalink
Merge branch 'branch-0.20' into dictionary-enable-groupby-sum
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Apr 26, 2021
2 parents 6c92e52 + 94afdda commit 394e424
Show file tree
Hide file tree
Showing 17 changed files with 1,216 additions and 136 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ add_library(cudf
src/strings/char_types/char_cases.cu
src/strings/char_types/char_types.cu
src/strings/combine/concatenate.cu
src/strings/combine/concatenate_list_elements.cu
src/strings/combine/join.cu
src/strings/contains.cu
src/strings/convert/convert_booleans.cu
Expand Down
179 changes: 138 additions & 41 deletions cpp/include/cudf/strings/combine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table_view.hpp>
Expand All @@ -29,47 +30,6 @@ namespace strings {
* @brief Strings APIs for concatenate and join
*/

/**
* @brief Row-wise concatenates the given list of strings columns and
* returns a single strings column result.
*
* Each new string is created by concatenating the strings from the same
* row delimited by the separator provided.
*
* Any row with a null entry will result in the corresponding output
* row being a null entry unless a narep string is specified to be used
* in its place.
*
* The number of strings in the columns provided must be the same.
*
* @code{.pseudo}
* Example:
* s1 = ['aa', null, '', 'aa']
* s2 = ['', 'bb', 'bb', null]
* r1 = concatenate([s1,s2])
* r1 is ['aa', null, 'bb', null]
* r2 = concatenate([s1,s2],':','_')
* r2 is ['aa:', '_:bb', ':bb', 'aa:_']
* @endcode
*
* @throw cudf::logic_error if input columns are not all strings columns.
* @throw cudf::logic_error if separator is not valid.
*
* @param strings_columns List of string columns to concatenate.
* @param separator String that should be inserted between each string from each row.
* Default is an empty string.
* @param narep String that should be used in place of any null strings
* found in any column. The default invalid scalar means any null entry in any column will
* produce a null result for that row.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column with concatenated results.
*/
std::unique_ptr<column> concatenate(
table_view const& strings_columns,
string_scalar const& separator = string_scalar(""),
string_scalar const& narep = string_scalar("", false),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Concatenates all strings in the column into one new string delimited
* by an optional separator string.
Expand Down Expand Up @@ -158,6 +118,143 @@ std::unique_ptr<column> concatenate(
string_scalar const& col_narep = string_scalar("", false),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @addtogroup strings_combine
* @{
* @file strings/combine.hpp
* @brief Strings APIs for concatenate and join
*/

/**
* @brief Row-wise concatenates the given list of strings columns and
* returns a single strings column result.
*
* Each new string is created by concatenating the strings from the same
* row delimited by the separator provided.
*
* Any row with a null entry will result in the corresponding output
* row being a null entry unless a narep string is specified to be used
* in its place.
*
* The number of strings in the columns provided must be the same.
*
* @code{.pseudo}
* Example:
* s1 = ['aa', null, '', 'aa']
* s2 = ['', 'bb', 'bb', null]
* r1 = concatenate([s1,s2])
* r1 is ['aa', null, 'bb', null]
* r2 = concatenate([s1,s2],':','_')
* r2 is ['aa:', '_:bb', ':bb', 'aa:_']
* @endcode
*
* @throw cudf::logic_error if input columns are not all strings columns.
* @throw cudf::logic_error if separator is not valid.
*
* @param strings_columns List of string columns to concatenate.
* @param separator String that should be inserted between each string from each row.
* Default is an empty string.
* @param narep String that should be used in place of any null strings
* found in any column. The default invalid scalar means any null entry in any column will
* produce a null result for that row.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column with concatenated results.
*/
std::unique_ptr<column> concatenate(
table_view const& strings_columns,
string_scalar const& separator = string_scalar(""),
string_scalar const& narep = string_scalar("", false),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Given a lists column of strings (each row is a list of strings), concatenates the strings
* within each row and returns a single strings column result.
*
* Each new string is created by concatenating the strings from the same row (same list element)
* delimited by the row separator provided in the `separators` strings column.
*
* A null list row will always result in a null string in the output row. Any non-null list row
* having a null element will result in the corresponding output row to be null unless a valid
* `string_narep` scalar is provided to be used in its place. Any null row in the `separators`
* column will also result in a null output row unless a valid `separator_narep` scalar is provided
* to be used in place of the null separators.
*
* @code{.pseudo}
* Example:
* s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff', 'gg'} ]
* sep = ['::', '%%', '!', '*', null]
*
* r1 = concatenate(s, sep)
* r1 is ['aa::bb::cc', null, '!dd', null, null]
*
* r2 = concatenate(s, sep, ':', '_')
* r2 is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg']
* @endcode
*
* @throw cudf::logic_error if input column is not lists of strings column.
* @throw cudf::logic_error if the number of rows from `separators` and `lists_strings_column` do
* not match
*
* @param lists_strings_column Column containing lists of strings to concatenate
* @param separators Strings column that provides separators for concatenation
* @param separator_narep String that should be used to replace null separator, default is an
* invalid-scalar denoting that rows containing null separator will result in null string in the
* corresponding output rows
* @param string_narep String that should be used to replace null strings in any
* non-null list row, default is an invalid-scalar denoting that list rows containing null strings
* will result in null string in the corresponding output rows
* @param mr Device memory resource used to allocate the returned column's
* device memory
* @return New strings column with concatenated results
*/
std::unique_ptr<column> concatenate_list_elements(
const lists_column_view& lists_strings_column,
const strings_column_view& separators,
string_scalar const& separator_narep = string_scalar("", false),
string_scalar const& string_narep = string_scalar("", false),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Given a lists column of strings (each row is a list of strings), concatenates the strings
* within each row and returns a single strings column result.
*
* Each new string is created by concatenating the strings from the same row (same list element)
* delimited by the separator provided.
*
* A null list row will always result in a null string in the output row. Any non-null list row
* having a null elenent will result in the corresponding output row to be null unless a narep
* string is specified to be used in its place.
*
* @code{.pseudo}
* Example:
* s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff'} ]
*
* r1 = concatenate(s)
* r1 is ['aabbcc', null, 'dd', null, 'ff']
*
* r2 = concatenate(s, ':', '_')
* r2 is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff']
* @endcode
*
* @throw cudf::logic_error if input column is not lists of strings column.
* @throw cudf::logic_error if separator is not valid.
*
* @param lists_strings_column Column containing lists of strings to concatenate
* @param separator String that should inserted between strings of each list row,
* default is an empty string
* @param narep String that should be used to replace null strings in any non-null
* list row, default is an invalid-scalar denoting that list rows containing null strings will
* result in null string in the corresponding output rows
* @param mr Device memory resource used to allocate the returned column's
* device memory
* @return New strings column with concatenated results
*/
std::unique_ptr<column> concatenate_list_elements(
const lists_column_view& lists_strings_column,
string_scalar const& separator = string_scalar(""),
string_scalar const& narep = string_scalar("", false),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf
16 changes: 13 additions & 3 deletions cpp/src/io/avro/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -367,9 +367,19 @@ table_with_metadata reader::impl::read(avro_reader_options const &options,
}

if (_metadata->total_data_size > 0) {
const auto buffer =
_source->host_read(_metadata->block_list[0].offset, _metadata->total_data_size);
rmm::device_buffer block_data(buffer->data(), buffer->size(), stream);
rmm::device_buffer block_data;
if (_source->is_device_read_preferred(_metadata->total_data_size)) {
block_data = rmm::device_buffer{_metadata->total_data_size, stream};
auto read_bytes = _source->device_read(_metadata->block_list[0].offset,
_metadata->total_data_size,
static_cast<uint8_t *>(block_data.data()),
stream);
block_data.resize(read_bytes);
} else {
const auto buffer =
_source->host_read(_metadata->block_list[0].offset, _metadata->total_data_size);
block_data = rmm::device_buffer{buffer->data(), buffer->size(), stream};
}

if (_metadata->codec != "" && _metadata->codec != "null") {
auto decomp_block_data = decompress_data(block_data, stream);
Expand Down
14 changes: 10 additions & 4 deletions cpp/src/io/orc/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -504,10 +504,16 @@ table_with_metadata reader::impl::read(size_type skip_rows,
len += stream_info[stream_count].length;
stream_count++;
}
const auto buffer = _source->host_read(offset, len);
CUDA_TRY(
cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyHostToDevice, stream.value()));
stream.synchronize();
if (_source->is_device_read_preferred(len)) {
CUDF_EXPECTS(_source->device_read(offset, len, d_dst, stream) == len,
"Unexpected discrepancy in bytes read.");
} else {
const auto buffer = _source->host_read(offset, len);
CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read.");
CUDA_TRY(
cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyHostToDevice, stream.value()));
stream.synchronize();
}
}

// Update chunks to reference streams pointers
Expand Down
32 changes: 21 additions & 11 deletions cpp/src/io/orc/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -929,10 +929,14 @@ void writer::impl::write_data_stream(gpu::StripeStream const &strm_desc,
{
const auto length = strm_desc.stream_size;
(*streams)[enc_stream.ids[strm_desc.stream_type]].length = length;
if (length != 0) {
const auto *stream_in = (compression_kind_ == NONE)
? enc_stream.data_ptrs[strm_desc.stream_type]
: (compressed_data + strm_desc.bfr_offset);
if (length == 0) { return; }

const auto *stream_in = (compression_kind_ == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type]
: (compressed_data + strm_desc.bfr_offset);

if (out_sink_->is_device_write_preferred(length)) {
out_sink_->device_write(stream_in, length, stream);
} else {
CUDA_TRY(
cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDeviceToHost, stream.value()));
stream.synchronize();
Expand Down Expand Up @@ -1105,11 +1109,13 @@ void writer::impl::write(table_view const &table)
size_t num_compressed_blocks = 0;
auto stream_output = [&]() {
size_t max_stream_size = 0;
bool all_device_write = true;

for (size_t stripe_id = 0; stripe_id < stripe_bounds.size(); stripe_id++) {
for (size_t i = 0; i < num_data_streams; i++) { // TODO range for (at least)
gpu::StripeStream *ss = &strm_descs[stripe_id][i];
size_t stream_size = ss->stream_size;
if (!out_sink_->is_device_write_preferred(ss->stream_size)) { all_device_write = false; }
size_t stream_size = ss->stream_size;
if (compression_kind_ != NONE) {
ss->first_block = num_compressed_blocks;
ss->bfr_offset = compressed_bfr_size;
Expand All @@ -1124,12 +1130,16 @@ void writer::impl::write(table_view const &table)
}
}

return pinned_buffer<uint8_t>{[](size_t size) {
uint8_t *ptr = nullptr;
CUDA_TRY(cudaMallocHost(&ptr, size));
return ptr;
}(max_stream_size),
cudaFreeHost};
if (all_device_write) {
return pinned_buffer<uint8_t>{nullptr, cudaFreeHost};
} else {
return pinned_buffer<uint8_t>{[](size_t size) {
uint8_t *ptr = nullptr;
CUDA_TRY(cudaMallocHost(&ptr, size));
return ptr;
}(max_stream_size),
cudaFreeHost};
}
}();

// Compress the data streams
Expand Down
Loading

0 comments on commit 394e424

Please sign in to comment.