diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 89f0ebeb239..ef7b1e4e2e7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -323,6 +323,7 @@ add_library(cudf src/strings/char_types/char_cases.cu src/strings/char_types/char_types.cu src/strings/combine/concatenate.cu + src/strings/combine/concatenate_list_elements.cu src/strings/combine/join.cu src/strings/contains.cu src/strings/convert/convert_booleans.cu diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp index 49f824b3805..113b6d64f9a 100644 --- a/cpp/include/cudf/strings/combine.hpp +++ b/cpp/include/cudf/strings/combine.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -29,47 +30,6 @@ namespace strings { * @brief Strings APIs for concatenate and join */ -/** - * @brief Row-wise concatenates the given list of strings columns and - * returns a single strings column result. - * - * Each new string is created by concatenating the strings from the same - * row delimited by the separator provided. - * - * Any row with a null entry will result in the corresponding output - * row to be null entry unless a narep string is specified to be used - * in its place. - * - * The number of strings in the columns provided must be the same. - * - * @code{.pseudo} - * Example: - * s1 = ['aa', null, '', 'aa'] - * s2 = ['', 'bb', 'bb', null] - * r1 = concatenate([s1,s2]) - * r1 is ['aa', null, 'bb', null] - * r2 = concatenate([s1,s2],':','_') - * r2 is ['aa:', '_:bb', ':bb', 'aa:_'] - * @endcode - * - * @throw cudf::logic_error if input columns are not all strings columns. - * @throw cudf::logic_error if separator is not valid. - * - * @param strings_columns List of string columns to concatenate. - * @param separator String that should inserted between each string from each row. - * Default is an empty string. - * @param narep String that should be used in place of any null strings - * found in any column. 
Default of invalid-scalar means any null entry in any column will - * produces a null result for that row. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with concatenated results. - */ -std::unique_ptr concatenate( - table_view const& strings_columns, - string_scalar const& separator = string_scalar(""), - string_scalar const& narep = string_scalar("", false), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Concatenates all strings in the column into one new string delimited * by an optional separator string. @@ -158,6 +118,143 @@ std::unique_ptr concatenate( string_scalar const& col_narep = string_scalar("", false), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @addtogroup strings_combine + * @{ + * @file strings/combine.hpp + * @brief Strings APIs for concatenate and join + */ + +/** + * @brief Row-wise concatenates the given list of strings columns and + * returns a single strings column result. + * + * Each new string is created by concatenating the strings from the same + * row delimited by the separator provided. + * + * Any row with a null entry will result in the corresponding output + * row to be null entry unless a narep string is specified to be used + * in its place. + * + * The number of strings in the columns provided must be the same. + * + * @code{.pseudo} + * Example: + * s1 = ['aa', null, '', 'aa'] + * s2 = ['', 'bb', 'bb', null] + * r1 = concatenate([s1,s2]) + * r1 is ['aa', null, 'bb', null] + * r2 = concatenate([s1,s2],':','_') + * r2 is ['aa:', '_:bb', ':bb', 'aa:_'] + * @endcode + * + * @throw cudf::logic_error if input columns are not all strings columns. + * @throw cudf::logic_error if separator is not valid. + * + * @param strings_columns List of string columns to concatenate. + * @param separator String that should inserted between each string from each row. 
+ * Default is an empty string. + * @param narep String that should be used in place of any null strings + * found in any column. Default of invalid-scalar means any null entry in any column will + * produce a null result for that row. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return New column with concatenated results. + */ +std::unique_ptr concatenate( + table_view const& strings_columns, + string_scalar const& separator = string_scalar(""), + string_scalar const& narep = string_scalar("", false), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Given a lists column of strings (each row is a list of strings), concatenates the strings + * within each row and returns a single strings column result. + * + * Each new string is created by concatenating the strings from the same row (same list element) + * delimited by the row separator provided in the `separators` strings column. + * + * A null list row will always result in a null string in the output row. Any non-null list row + * having a null element will result in the corresponding output row to be null unless a valid + * `string_narep` scalar is provided to be used in its place. Any null row in the `separators` + * column will also result in a null output row unless a valid `separator_narep` scalar is provided + * to be used in place of the null separators. + * + * @code{.pseudo} + * Example: + * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff', 'gg'} ] + * sep = ['::', '%%', '!', '*', null] + * + * r1 = concatenate_list_elements(s, sep) + * r1 is ['aa::bb::cc', null, '!dd', null, null] + * + * r2 = concatenate_list_elements(s, sep, ':', '_') + * r2 is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] + * @endcode + * + * @throw cudf::logic_error if input column is not a lists column of strings. 
+ * @throw cudf::logic_error if the number of rows from `separators` and `lists_strings_column` do + * not match + * + * @param lists_strings_column Column containing lists of strings to concatenate + * @param separators Strings column that provides separators for concatenation + * @param separator_narep String that should be used to replace null separators, default is an + * invalid-scalar denoting that rows containing a null separator will result in null string in the + * corresponding output rows + * @param string_narep String that should be used to replace null strings in any + * non-null list row, default is an invalid-scalar denoting that list rows containing null strings + * will result in null string in the corresponding output rows + * @param mr Device memory resource used to allocate the returned column's + * device memory + * @return New strings column with concatenated results + */ +std::unique_ptr concatenate_list_elements( + const lists_column_view& lists_strings_column, + const strings_column_view& separators, + string_scalar const& separator_narep = string_scalar("", false), + string_scalar const& string_narep = string_scalar("", false), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Given a lists column of strings (each row is a list of strings), concatenates the strings + * within each row and returns a single strings column result. + * + * Each new string is created by concatenating the strings from the same row (same list element) + * delimited by the separator provided. + * + * A null list row will always result in a null string in the output row. Any non-null list row + * having a null element will result in the corresponding output row to be null unless a narep + * string is specified to be used in its place. 
+ * + * @code{.pseudo} + * Example: + * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff'} ] + * + * r1 = concatenate_list_elements(s) + * r1 is ['aabbcc', null, 'dd', null, 'ff'] + * + * r2 = concatenate_list_elements(s, ':', '_') + * r2 is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] + * @endcode + * + * @throw cudf::logic_error if input column is not a lists column of strings. + * @throw cudf::logic_error if separator is not valid. + * + * @param lists_strings_column Column containing lists of strings to concatenate + * @param separator String that should be inserted between strings of each list row, + * default is an empty string + * @param narep String that should be used to replace null strings in any non-null + * list row, default is an invalid-scalar denoting that list rows containing null strings will + * result in null string in the corresponding output rows + * @param mr Device memory resource used to allocate the returned column's + * device memory + * @return New strings column with concatenated results + */ +std::unique_ptr concatenate_list_elements( + const lists_column_view& lists_strings_column, + string_scalar const& separator = string_scalar(""), + string_scalar const& narep = string_scalar("", false), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 600633f0ed8..b2a5c249485 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -367,9 +367,19 @@ table_with_metadata reader::impl::read(avro_reader_options const &options, } if (_metadata->total_data_size > 0) { - const auto buffer = - _source->host_read(_metadata->block_list[0].offset, _metadata->total_data_size); - rmm::device_buffer block_data(buffer->data(), buffer->size(), stream); + rmm::device_buffer block_data; + if 
(_source->is_device_read_preferred(_metadata->total_data_size)) { + block_data = 
rmm::device_buffer{_metadata->total_data_size, stream}; + auto read_bytes = _source->device_read(_metadata->block_list[0].offset, + _metadata->total_data_size, + static_cast(block_data.data()), + stream); + block_data.resize(read_bytes); + } else { + const auto buffer = + _source->host_read(_metadata->block_list[0].offset, _metadata->total_data_size); + block_data = rmm::device_buffer{buffer->data(), buffer->size(), stream}; + } if (_metadata->codec != "" && _metadata->codec != "null") { auto decomp_block_data = decompress_data(block_data, stream); diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index efb4ad3eb64..ec3cc85dc1a 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -504,10 +504,16 @@ table_with_metadata reader::impl::read(size_type skip_rows, len += stream_info[stream_count].length; stream_count++; } - const auto buffer = _source->host_read(offset, len); - CUDA_TRY( - cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyHostToDevice, stream.value())); - stream.synchronize(); + if (_source->is_device_read_preferred(len)) { + CUDF_EXPECTS(_source->device_read(offset, len, d_dst, stream) == len, + "Unexpected discrepancy in bytes read."); + } else { + const auto buffer = _source->host_read(offset, len); + CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read."); + CUDA_TRY( + cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyHostToDevice, stream.value())); + stream.synchronize(); + } } // Update chunks to reference streams pointers diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 81a0887bf3a..51b888f0daa 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -929,10 +929,14 @@ void writer::impl::write_data_stream(gpu::StripeStream const &strm_desc, { const auto length = strm_desc.stream_size; (*streams)[enc_stream.ids[strm_desc.stream_type]].length = length; - if (length != 0) { - const auto *stream_in = 
(compression_kind_ == NONE) - ? enc_stream.data_ptrs[strm_desc.stream_type] - : (compressed_data + strm_desc.bfr_offset); + if (length == 0) { return; } + + const auto *stream_in = (compression_kind_ == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type] + : (compressed_data + strm_desc.bfr_offset); + + if (out_sink_->is_device_write_preferred(length)) { + out_sink_->device_write(stream_in, length, stream); + } else { CUDA_TRY( cudaMemcpyAsync(stream_out, stream_in, length, cudaMemcpyDeviceToHost, stream.value())); stream.synchronize(); @@ -1105,11 +1109,13 @@ void writer::impl::write(table_view const &table) size_t num_compressed_blocks = 0; auto stream_output = [&]() { size_t max_stream_size = 0; + bool all_device_write = true; for (size_t stripe_id = 0; stripe_id < stripe_bounds.size(); stripe_id++) { for (size_t i = 0; i < num_data_streams; i++) { // TODO range for (at least) gpu::StripeStream *ss = &strm_descs[stripe_id][i]; - size_t stream_size = ss->stream_size; + if (!out_sink_->is_device_write_preferred(ss->stream_size)) { all_device_write = false; } + size_t stream_size = ss->stream_size; if (compression_kind_ != NONE) { ss->first_block = num_compressed_blocks; ss->bfr_offset = compressed_bfr_size; @@ -1124,12 +1130,16 @@ void writer::impl::write(table_view const &table) } } - return pinned_buffer{[](size_t size) { - uint8_t *ptr = nullptr; - CUDA_TRY(cudaMallocHost(&ptr, size)); - return ptr; - }(max_stream_size), - cudaFreeHost}; + if (all_device_write) { + return pinned_buffer{nullptr, cudaFreeHost}; + } else { + return pinned_buffer{[](size_t size) { + uint8_t *ptr = nullptr; + CUDA_TRY(cudaMallocHost(&ptr, size)); + return ptr; + }(max_stream_size), + cudaFreeHost}; + } }(); // Compress the data streams diff --git a/cpp/src/strings/combine/concatenate_list_elements.cu b/cpp/src/strings/combine/concatenate_list_elements.cu new file mode 100644 index 00000000000..32f44359799 --- /dev/null +++ b/cpp/src/strings/combine/concatenate_list_elements.cu @@ 
-0,0 +1,265 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf { +namespace strings { +namespace detail { + +namespace { +/** + * @brief Compute string sizes, string validities, and concatenate strings functor. + * + * This functor is executed twice. In the first pass, the sizes and validities of the output strings + * will be computed. In the second pass, this will concatenate the strings within each list element + * of the given lists column and apply the separator. The null-replacement string scalar + * `string_narep_dv` (if valid) is used in place of any null string. + * + * @tparam Functor The functor which can check for validity of the input list at a given list index + * as well as access to the separator corresponding to the list index. + */ +template +struct compute_size_and_concatenate_fn { + Functor const func; + column_device_view const lists_dv; + offset_type const* const list_offsets; + column_device_view const strings_dv; + string_scalar_device_view const string_narep_dv; + + offset_type* d_offsets{nullptr}; + + // If d_chars == nullptr: only compute sizes and validities of the output strings. + // If d_chars != nullptr: only concatenate strings. + char* d_chars{nullptr}; + + // We need to set `1` or `0` for the validities of the output strings. 
+ int8_t* d_validities{nullptr}; + + __device__ void operator()(size_type const idx) + { + // Second pass (d_chars set): rows already marked null in the first pass produce no characters + if (d_chars and not d_validities[idx]) { return; } + + if (not d_chars and func.is_null_list(lists_dv, idx)) { + d_offsets[idx] = 0; + d_validities[idx] = false; + return; + } + + auto const separator = func.separator(idx); + auto const separator_size = separator.size_bytes(); + auto size_bytes = size_type{0}; + bool written = false; + char* output_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; + + for (size_type str_idx = list_offsets[idx], idx_end = list_offsets[idx + 1]; str_idx < idx_end; + ++str_idx) { + if (not d_chars and (strings_dv.is_null(str_idx) and not string_narep_dv.is_valid())) { + d_offsets[idx] = 0; + d_validities[idx] = false; + return; // early termination: the entire list of strings will result in a null string + } + auto const d_str = strings_dv.is_null(str_idx) ? string_narep_dv.value() + : strings_dv.element(str_idx); + size_bytes += separator_size + d_str.size_bytes(); + if (output_ptr) { + // Separator is inserted only in between strings + if (written) { output_ptr = detail::copy_string(output_ptr, separator); } + output_ptr = detail::copy_string(output_ptr, d_str); + written = true; + } + } + + // The loop counted one separator per string but separators go only in between strings, so one is subtracted. An empty (non-null) list never enters the loop and must yield size 0, not a negative size. + if (not d_chars) { + d_offsets[idx] = (list_offsets[idx] == list_offsets[idx + 1]) ? offset_type{0} : static_cast<offset_type>(size_bytes - separator_size); + d_validities[idx] = true; + } + } +}; + +/** + * @brief Functor accompanying with `compute_size_and_concatenate_fn` for computing output string + * sizes, output string validities, and concatenating strings within list elements; used when the + * separator is a string scalar. 
+ */ +struct scalar_separator_fn { + string_scalar_device_view const d_separator; + + __device__ bool is_null_list(column_device_view const& lists_dv, size_type const idx) const + noexcept + { + return lists_dv.is_null(idx); + } + + __device__ string_view separator(size_type const) const noexcept { return d_separator.value(); } +}; + +} // namespace + +std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, + string_scalar const& separator, + string_scalar const& narep, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, + "The input column must be a column of lists of strings"); + CUDF_EXPECTS(separator.is_valid(), "Parameter separator must be a valid string_scalar"); + + auto const num_rows = lists_strings_column.size(); + if (num_rows == 0) { return detail::make_empty_strings_column(stream, mr); } + + // Accessing the child strings column of the lists column must be done by calling `child()` on the + // lists column, not `get_sliced_child()`. This is because calling to `offsets_begin()` on the + // lists column returns a pointer to the offsets of the original lists column, which may not start + // from `0`. 
+ auto const strings_col = strings_column_view(lists_strings_column.child()); + auto const lists_dv_ptr = column_device_view::create(lists_strings_column.parent(), stream); + auto const strings_dv_ptr = column_device_view::create(strings_col.parent(), stream); + auto const sep_dv = get_scalar_device_view(const_cast(separator)); + auto const string_narep_dv = get_scalar_device_view(const_cast(narep)); + + auto const func = scalar_separator_fn{sep_dv}; + auto const comp_fn = compute_size_and_concatenate_fn{ + func, + *lists_dv_ptr, + lists_strings_column.offsets_begin(), + *strings_dv_ptr, + string_narep_dv, + }; + auto [offsets_column, chars_column, null_mask, null_count] = + make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); + + return make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column), + null_count, + std::move(null_mask), + stream, + mr); +} + +namespace { +/** + * @brief Functor accompanying with `compute_size_and_concatenate_fn` for computing output string + * sizes, output string validities, and concatenating strings within list elements; used when the + * separators are given as a strings column. + */ +struct column_separators_fn { + column_device_view const separators_dv; + string_scalar_device_view const sep_narep_dv; + + __device__ bool is_null_list(column_device_view const& lists_dv, size_type const idx) const + noexcept + { + return lists_dv.is_null(idx) or (separators_dv.is_null(idx) and not sep_narep_dv.is_valid()); + } + + __device__ string_view separator(size_type const idx) const noexcept + { + return separators_dv.is_valid(idx) ? 
separators_dv.element(idx) + : sep_narep_dv.value(); + } +}; + +} // namespace + +std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, + strings_column_view const& separators, + string_scalar const& separator_narep, + string_scalar const& string_narep, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, + "The input column must be a column of lists of strings"); + CUDF_EXPECTS(lists_strings_column.size() == separators.size(), + "Separators column should be the same size as the lists columns"); + + auto const num_rows = lists_strings_column.size(); + if (num_rows == 0) { return detail::make_empty_strings_column(stream, mr); } + + // Accessing the child strings column of the lists column must be done by calling `child()` on the + // lists column, not `get_sliced_child()`. This is because calling to `offsets_begin()` on the + // lists column returns a pointer to the offsets of the original lists column, which may not start + // from `0`. 
+ auto const strings_col = strings_column_view(lists_strings_column.child()); + auto const lists_dv_ptr = column_device_view::create(lists_strings_column.parent(), stream); + auto const strings_dv_ptr = column_device_view::create(strings_col.parent(), stream); + auto const string_narep_dv = get_scalar_device_view(const_cast(string_narep)); + auto const sep_dv_ptr = column_device_view::create(separators.parent(), stream); + auto const sep_narep_dv = get_scalar_device_view(const_cast(separator_narep)); + + auto const func = column_separators_fn{*sep_dv_ptr, sep_narep_dv}; + auto const comp_fn = compute_size_and_concatenate_fn{ + func, + *lists_dv_ptr, + lists_strings_column.offsets_begin(), + *strings_dv_ptr, + string_narep_dv, + }; + auto [offsets_column, chars_column, null_mask, null_count] = + make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); + + return make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column), + null_count, + std::move(null_mask), + stream, + mr); +} + +} // namespace detail + +std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, + string_scalar const& separator, + string_scalar const& narep, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::concatenate_list_elements( + lists_strings_column, separator, narep, rmm::cuda_stream_default, mr); +} + +std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, + strings_column_view const& separators, + string_scalar const& separator_narep, + string_scalar const& string_narep, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::concatenate_list_elements( + lists_strings_column, separators, separator_narep, string_narep, rmm::cuda_stream_default, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/src/strings/utilities.cuh b/cpp/src/strings/utilities.cuh index a361615f169..82e58dd054b 100644 --- 
a/cpp/src/strings/utilities.cuh +++ b/cpp/src/strings/utilities.cuh @@ -16,12 +16,15 @@ #pragma once #include +#include #include #include #include #include +#include + #include namespace cudf { @@ -38,7 +41,7 @@ namespace detail { */ __device__ inline char* copy_and_increment(char* buffer, const char* input, size_type bytes) { - memcpy(buffer, input, bytes); + std::memcpy(buffer, input, bytes); return buffer + bytes; } @@ -102,12 +105,93 @@ auto make_strings_children( auto const bytes = cudf::detail::get_value(offsets_view, strings_count, stream); std::unique_ptr chars_column = create_chars_child_column(strings_count, bytes, stream, mr); - size_and_exec_fn.d_chars = chars_column->mutable_view().template data(); - for_each_fn(size_and_exec_fn); + + // Execute the function fn again to fill the chars column. + // Note that if the output chars column has zero size, the function fn should not be called to + // avoid accidentally overwriting the offsets. + if (bytes > 0) { + size_and_exec_fn.d_chars = chars_column->mutable_view().template data(); + for_each_fn(size_and_exec_fn); + } return std::make_pair(std::move(offsets_column), std::move(chars_column)); } +/** + * @brief Creates child offsets, chars columns and null mask, null count of a strings column by + * applying the template function that can be used for computing the output size of each string as + * well as create the output. + * + * @tparam SizeAndExecuteFunction Function must accept an index and return a size. + * It must have members `d_offsets`, `d_chars`, and `d_validities` which are set to memory + * containing the offsets column, chars column and string validities during write. + * + * @param size_and_exec_fn This is called twice. Once for the output size of each string, which is + * written into the `d_offsets` array. After that, `d_chars` is set and this + * is called again to fill in the chars memory. 
The `d_validities` array may + * be modified to set the value `0` for the corresponding rows that contain + * null string elements. + * @param exec_size Range for executing the function `size_and_exec_fn`. + * @param strings_count Number of strings. + * @param mr Device memory resource used to allocate the returned columns' device memory. + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return offsets child column, chars child column, null_mask, and null_count for a strings column. + */ +template +std::tuple, std::unique_ptr, rmm::device_buffer, size_type> +make_strings_children_with_null_mask( + SizeAndExecuteFunction size_and_exec_fn, + size_type exec_size, + size_type strings_count, + rmm::cuda_stream_view stream = rmm::cuda_stream_default, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto offsets_column = make_numeric_column( + data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + auto offsets_view = offsets_column->mutable_view(); + auto d_offsets = offsets_view.template data(); + size_and_exec_fn.d_offsets = d_offsets; + + auto validities = rmm::device_uvector(strings_count, stream); + size_and_exec_fn.d_validities = validities.begin(); + + // This is called twice: once for offsets and validities, and once for chars + auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) { + thrust::for_each_n(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + exec_size, + size_and_exec_fn); + }; + + // Compute the string sizes (storing in `d_offsets`) and string validities + for_each_fn(size_and_exec_fn); + + // Compute the offsets from string sizes + thrust::exclusive_scan( + rmm::exec_policy(stream), d_offsets, d_offsets + strings_count + 1, d_offsets); + + // Now build the chars column + auto const bytes = cudf::detail::get_value(offsets_view, strings_count, stream); + auto chars_column = 
create_chars_child_column(strings_count, bytes, stream, mr); + + // Execute the function fn again to fill the chars column. + // Note that if the output chars column has zero size, the function fn should not be called to + // avoid accidentally overwriting the offsets. + if (bytes > 0) { + size_and_exec_fn.d_chars = chars_column->mutable_view().template data(); + for_each_fn(size_and_exec_fn); + } + + // Finally compute null mask and null count from the validities array + auto [null_mask, null_count] = cudf::detail::valid_if( + validities.begin(), validities.end(), thrust::identity{}, stream, mr); + + return std::make_tuple(std::move(offsets_column), + std::move(chars_column), + null_count > 0 ? std::move(null_mask) : rmm::device_buffer{}, + null_count); +} + /** * @brief Converts a single UTF-8 character into a code-point value that * can be used for lookup in the character flags or the character case tables. diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 9dbd4a881a6..045907b31b1 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -312,18 +312,20 @@ ConfigureTest(DISPATCHER_TEST types/type_dispatcher_test.cu) ################################################################################################### # - strings test ---------------------------------------------------------------------------------- ConfigureTest(STRINGS_TEST - strings/factories_test.cu strings/array_tests.cu strings/attrs_tests.cpp strings/booleans_tests.cpp strings/case_tests.cpp strings/chars_types_tests.cpp - strings/combine_tests.cpp + strings/combine/concatenate_list_elements_tests.cpp + strings/combine/concatenate_tests.cpp + strings/combine/join_strings_tests.cpp strings/concatenate_tests.cpp strings/contains_tests.cpp strings/datetime_tests.cpp strings/durations_tests.cpp strings/extract_tests.cpp + strings/factories_test.cu strings/fill_tests.cpp strings/findall_tests.cpp strings/find_tests.cpp diff --git 
a/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp b/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp new file mode 100644 index 00000000000..b6afd588dfb --- /dev/null +++ b/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp @@ -0,0 +1,511 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +struct StringsListsConcatenateTest : public cudf::test::BaseFixture { +}; + +namespace { +using STR_LISTS = cudf::test::lists_column_wrapper; +using STR_COL = cudf::test::strings_column_wrapper; +using INT_LISTS = cudf::test::lists_column_wrapper; + +constexpr bool print_all{false}; + +auto null_at(cudf::size_type idx) +{ + return cudf::detail::make_counting_transform_iterator(0, [idx](auto i) { return i != idx; }); +} + +auto all_nulls() +{ + return cudf::detail::make_counting_transform_iterator(0, [](auto) { return false; }); +} + +auto nulls_from_nullptr(std::vector const& strs) +{ + return thrust::make_transform_iterator(strs.begin(), [](auto ptr) { return ptr != nullptr; }); +} + +} // namespace + +TEST_F(StringsListsConcatenateTest, InvalidInput) +{ + // Invalid list type + { + auto const string_lists = INT_LISTS{{1, 2, 3}, {4, 5, 6}}.release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv), 
cudf::logic_error); + } + + // Invalid scalar separator + { + auto const string_lists = + STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + EXPECT_THROW( + cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("", false)), + cudf::logic_error); + } + + // Invalid column separators + { + auto const string_lists = + STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + auto const separators = STR_COL{"+++"}.release(); // size doesn't match with lists column size + EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv, separators->view()), + cudf::logic_error); + } +} + +TEST_F(StringsListsConcatenateTest, EmptyInput) +{ + auto const string_lists = STR_LISTS{}.release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + auto const expected = STR_COL{}; + auto results = cudf::strings::concatenate_list_elements(string_lv); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + + auto const separators = STR_COL{}.release(); + results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); +} + +TEST_F(StringsListsConcatenateTest, ZeroSizeStringsInput) +{ + auto const string_lists = + STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + auto const expected = STR_COL{"", "", ""}; + + auto results = cudf::strings::concatenate_list_elements(string_lv); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + + auto const separators = STR_COL{"", "", ""}.release(); + results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); +} + 
+TEST_F(StringsListsConcatenateTest, AllNullsStringsInput) +{ + auto const string_lists = STR_LISTS{ + STR_LISTS{{""}, all_nulls()}, + STR_LISTS{{"", "", ""}, all_nulls()}, + STR_LISTS{{"", ""}, + all_nulls()}}.release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + auto const expected = STR_COL{{"", "", ""}, all_nulls()}; + + auto results = cudf::strings::concatenate_list_elements(string_lv); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + + auto const separators = STR_COL{{"", "", ""}, all_nulls()}.release(); + results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); +} + +TEST_F(StringsListsConcatenateTest, ScalarSeparator) +{ + auto const string_lists = STR_LISTS{{STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, + STR_LISTS{}, /*NULL*/ + STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, + STR_LISTS{"zzz", "xxxxx"}}, + null_at(1)} + .release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + + // No null replacement + { + auto const results = + cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // With null replacement + { + auto const results = cudf::strings::concatenate_list_elements( + string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); + std::vector h_expected{ + "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } +} + +TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) +{ + auto const string_lists = 
STR_LISTS{ + {STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, + STR_LISTS{}, /*NULL*/ + STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, + STR_LISTS{"zzz", "xxxxx"}, + STR_LISTS{"11111", "11111", "11111", "11111", "11111"}, /*NULL*/ + STR_LISTS{{"abcdef", "012345", "" /*NULL*/, "xxx000"}, null_at(2)}, + STR_LISTS{{"xyz" /*NULL*/, "11111", "00000"}, null_at(0)}, + STR_LISTS{"0a0b0c", "5x5y5z"}, + STR_LISTS{"xxx"}, /*NULL*/ + STR_LISTS{"ééé", "12345abcdef"}, + STR_LISTS{"aaaééébbbéééccc", "12345"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { + return i != 1 && i != 4 && i != 8; + })}.release(); + + // Sliced the entire lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); + auto const results = + cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + std::vector h_expected{nullptr, + nullptr, + nullptr, + "zzz+++xxxxx", + nullptr, + nullptr, + nullptr, + "0a0b0c+++5x5y5z", + nullptr, + "ééé+++12345abcdef", + "aaaééébbbéééccc+++12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the entire lists column, with null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); + auto const results = cudf::strings::concatenate_list_elements( + string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); + std::vector h_expected{"a+++___+++ccc", + nullptr, + "___+++efgh+++ijk", + "zzz+++xxxxx", + nullptr, + "abcdef+++012345+++___+++xxx000", + "___+++11111+++00000", + "0a0b0c+++5x5y5z", + nullptr, + "ééé+++12345abcdef", + "aaaééébbbéééccc+++12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced 
the first half of the lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); + auto const results = + cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the first half of the lists column, with null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); + auto const results = cudf::strings::concatenate_list_elements( + string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); + std::vector h_expected{ + "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the second half of the lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); + auto const results = + cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + std::vector h_expected{ + nullptr, nullptr, "0a0b0c+++5x5y5z", nullptr, "ééé+++12345abcdef", "aaaééébbbéééccc+++12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the second half of the lists column, with null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); + auto const results = cudf::strings::concatenate_list_elements( + string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); + std::vector 
h_expected{"abcdef+++012345+++___+++xxx000", + "___+++11111+++00000", + "0a0b0c+++5x5y5z", + nullptr, + "ééé+++12345abcdef", + "aaaééébbbéééccc+++12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the middle part of the lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); + auto const results = + cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + std::vector h_expected{ + "zzz+++xxxxx", nullptr, nullptr, nullptr, "0a0b0c+++5x5y5z"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the middle part of the lists column, with null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); + auto const results = cudf::strings::concatenate_list_elements( + string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); + std::vector h_expected{"zzz+++xxxxx", + nullptr, + "abcdef+++012345+++___+++xxx000", + "___+++11111+++00000", + "0a0b0c+++5x5y5z"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } +} + +TEST_F(StringsListsConcatenateTest, ColumnSeparators) +{ + auto const string_lists = STR_LISTS{{STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, + STR_LISTS{}, /*NULL*/ + STR_LISTS{"0a0b0c", "xyzééé"}, + STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, + STR_LISTS{{"ééé" /*NULL*/, "ááá", "ííí"}, null_at(0)}, + STR_LISTS{"zzz", "xxxxx"}}, + null_at(1)} + .release(); + auto const string_lv = cudf::lists_column_view(string_lists->view()); + auto const separators = STR_COL{ + {"+++", "***", "!!!" 
/*NULL*/, "$$$" /*NULL*/, "%%%", "^^^"}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { + return i != 2 && i != 3; + })}.release(); + + // No null replacement + { + auto const results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + std::vector h_expected{nullptr, nullptr, nullptr, nullptr, nullptr, "zzz^^^xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // With null replacement for separators + { + auto const results = cudf::strings::concatenate_list_elements( + string_lv, separators->view(), cudf::string_scalar("|||")); + std::vector h_expected{ + nullptr, nullptr, "0a0b0c|||xyzééé", nullptr, nullptr, "zzz^^^xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // With null replacement for strings + { + auto const results = cudf::strings::concatenate_list_elements( + string_lv, separators->view(), cudf::string_scalar("", false), cudf::string_scalar("XXXXX")); + std::vector h_expected{ + "a+++XXXXX+++ccc", nullptr, nullptr, nullptr, "XXXXX%%%ááá%%%ííí", "zzz^^^xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // With null replacement for both separators and strings + { + auto const results = cudf::strings::concatenate_list_elements( + string_lv, separators->view(), cudf::string_scalar("|||"), cudf::string_scalar("XXXXX")); + std::vector h_expected{"a+++XXXXX+++ccc", + nullptr, + "0a0b0c|||xyzééé", + "XXXXX|||efgh|||ijk", + "XXXXX%%%ááá%%%ííí", + "zzz^^^xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, 
print_all); + } +} + +TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) +{ + auto const string_lists = STR_LISTS{ + {STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, + STR_LISTS{}, /*NULL*/ + STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, + STR_LISTS{"zzz", "xxxxx"}, + STR_LISTS{"11111", "11111", "11111", "11111", "11111"}, /*NULL*/ + STR_LISTS{{"abcdef", "012345", "" /*NULL*/, "xxx000"}, null_at(2)}, + STR_LISTS{{"xyz" /*NULL*/, "11111", "00000"}, null_at(0)}, + STR_LISTS{"0a0b0c", "5x5y5z"}, + STR_LISTS{"xxx"}, /*NULL*/ + STR_LISTS{"ééé", "12345abcdef"}, + STR_LISTS{"aaaééébbbéééccc", "12345"}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { + return i != 1 && i != 4 && i != 8; + })}.release(); + auto const separators = STR_COL{ + {"+++", "***", "!!!" /*NULL*/, "$$$" /*NULL*/, "%%%", "^^^", "~!~", "###", "&&&", "-+-", "=+="}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { + return i != 2 && i != 3; + })}.release(); + + // Sliced the entire lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); + auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); + auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + std::vector h_expected{nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + "0a0b0c###5x5y5z", + nullptr, + "ééé-+-12345abcdef", + "aaaééébbbéééccc=+=12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the entire lists column, with null replacements + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); + auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); + auto const results = 
cudf::strings::concatenate_list_elements( + string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); + std::vector h_expected{"a+++___+++ccc", + nullptr, + "___|||efgh|||ijk", + "zzz|||xxxxx", + nullptr, + "abcdef^^^012345^^^___^^^xxx000", + "___~!~11111~!~00000", + "0a0b0c###5x5y5z", + nullptr, + "ééé-+-12345abcdef", + "aaaééébbbéééccc=+=12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the first half of the lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); + auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); + auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + std::vector h_expected{nullptr, nullptr, nullptr, nullptr}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the first half of the lists column, with null replacements + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); + auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); + auto const results = cudf::strings::concatenate_list_elements( + string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); + std::vector h_expected{ + "a+++___+++ccc", nullptr, "___|||efgh|||ijk", "zzz|||xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the second half of the lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); + auto const sep_col = 
cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); + auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + std::vector h_expected{ + nullptr, nullptr, "0a0b0c###5x5y5z", nullptr, "ééé-+-12345abcdef", "aaaééébbbéééccc=+=12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the second half of the lists column, with null replacements + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); + auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); + auto const results = cudf::strings::concatenate_list_elements( + string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); + std::vector h_expected{"abcdef^^^012345^^^___^^^xxx000", + "___~!~11111~!~00000", + "0a0b0c###5x5y5z", + nullptr, + "ééé-+-12345abcdef", + "aaaééébbbéééccc=+=12345"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the middle part of the lists column, no null replacement + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); + auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); + auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + std::vector h_expected{nullptr, nullptr, nullptr, nullptr, "0a0b0c###5x5y5z"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Sliced the middle part of the lists column, with null replacements + { + auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); + auto 
const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); + auto const results = cudf::strings::concatenate_list_elements( + string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); + std::vector h_expected{"zzz|||xxxxx", + nullptr, + "abcdef^^^012345^^^___^^^xxx000", + "___~!~11111~!~00000", + "0a0b0c###5x5y5z"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } +} diff --git a/cpp/tests/strings/combine_tests.cpp b/cpp/tests/strings/combine/concatenate_tests.cpp similarity index 89% rename from cpp/tests/strings/combine_tests.cpp rename to cpp/tests/strings/combine/concatenate_tests.cpp index 2ca0562064d..c1c390e8a82 100644 --- a/cpp/tests/strings/combine_tests.cpp +++ b/cpp/tests/strings/combine/concatenate_tests.cpp @@ -28,8 +28,6 @@ #include -#include - struct StringsCombineTest : public cudf::test::BaseFixture { }; @@ -109,64 +107,6 @@ TEST_F(StringsCombineTest, ConcatZeroSizeStringsColumns) cudf::test::expect_strings_empty(results->view()); } -TEST_F(StringsCombineTest, Join) -{ - std::vector h_strings{"eee", "bb", nullptr, "zzzz", "", "aaa", "ééé"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - auto view1 = cudf::strings_column_view(strings); - - { - auto results = cudf::strings::join_strings(view1); - - cudf::test::strings_column_wrapper expected{"eeebbzzzzaaaééé"}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - } - { - auto results = cudf::strings::join_strings(view1, cudf::string_scalar("+")); - - cudf::test::strings_column_wrapper expected{"eee+bb+zzzz++aaa+ééé"}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - } - { - auto results = - cudf::strings::join_strings(view1, cudf::string_scalar("+"), cudf::string_scalar("___")); - - 
cudf::test::strings_column_wrapper expected{"eee+bb+___+zzzz++aaa+ééé"}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - } -} - -TEST_F(StringsCombineTest, JoinZeroSizeStringsColumn) -{ - cudf::column_view zero_size_strings_column( - cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto strings_view = cudf::strings_column_view(zero_size_strings_column); - auto results = cudf::strings::join_strings(strings_view); - cudf::test::expect_strings_empty(results->view()); -} - -TEST_F(StringsCombineTest, JoinAllNullStringsColumn) -{ - cudf::test::strings_column_wrapper strings({"", "", ""}, {0, 0, 0}); - - auto results = cudf::strings::join_strings(cudf::strings_column_view(strings)); - cudf::test::strings_column_wrapper expected1({""}, {0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); - - results = cudf::strings::join_strings( - cudf::strings_column_view(strings), cudf::string_scalar(""), cudf::string_scalar("3")); - cudf::test::strings_column_wrapper expected2({"333"}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); - - results = cudf::strings::join_strings( - cudf::strings_column_view(strings), cudf::string_scalar("-"), cudf::string_scalar("*")); - cudf::test::strings_column_wrapper expected3({"*-*-*"}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected3); -} - struct StringsConcatenateWithColSeparatorTest : public cudf::test::BaseFixture { }; diff --git a/cpp/tests/strings/combine/join_strings_tests.cpp b/cpp/tests/strings/combine/join_strings_tests.cpp new file mode 100644 index 00000000000..552cd5b0f95 --- /dev/null +++ b/cpp/tests/strings/combine/join_strings_tests.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +struct JoinStringsTest : public cudf::test::BaseFixture { +}; + +TEST_F(JoinStringsTest, Join) +{ + std::vector h_strings{"eee", "bb", nullptr, "zzzz", "", "aaa", "ééé"}; + cudf::test::strings_column_wrapper strings( + h_strings.begin(), + h_strings.end(), + thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + auto view1 = cudf::strings_column_view(strings); + + { + auto results = cudf::strings::join_strings(view1); + + cudf::test::strings_column_wrapper expected{"eeebbzzzzaaaééé"}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto results = cudf::strings::join_strings(view1, cudf::string_scalar("+")); + + cudf::test::strings_column_wrapper expected{"eee+bb+zzzz++aaa+ééé"}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto results = + cudf::strings::join_strings(view1, cudf::string_scalar("+"), cudf::string_scalar("___")); + + cudf::test::strings_column_wrapper expected{"eee+bb+___+zzzz++aaa+ééé"}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + +TEST_F(JoinStringsTest, JoinZeroSizeStringsColumn) +{ + cudf::column_view zero_size_strings_column( + cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + auto strings_view = cudf::strings_column_view(zero_size_strings_column); + auto results = cudf::strings::join_strings(strings_view); + cudf::test::expect_strings_empty(results->view()); +} + +TEST_F(JoinStringsTest, 
JoinAllNullStringsColumn) +{ + cudf::test::strings_column_wrapper strings({"", "", ""}, {0, 0, 0}); + + auto results = cudf::strings::join_strings(cudf::strings_column_view(strings)); + cudf::test::strings_column_wrapper expected1({""}, {0}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1); + + results = cudf::strings::join_strings( + cudf::strings_column_view(strings), cudf::string_scalar(""), cudf::string_scalar("3")); + cudf::test::strings_column_wrapper expected2({"333"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); + + results = cudf::strings::join_strings( + cudf::strings_column_view(strings), cudf::string_scalar("-"), cudf::string_scalar("*")); + cudf::test::strings_column_wrapper expected3({"*-*-*"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected3); +} diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 2d438d37b3e..d88e5ee1708 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1287,7 +1287,9 @@ def deserialize(cls, header: dict, frames: list) -> ColumnBase: mask = None if "mask" in header: mask = Buffer.deserialize(header["mask"], [frames[1]]) - return build_column(data=data, dtype=dtype, mask=mask) + return build_column( + data=data, dtype=dtype, mask=mask, size=header.get("size", None) + ) def binary_operator( self, op: builtins.str, other: BinaryOperand, reflect: bool = False diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 9f54a16af22..e8ea57774bb 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -1,20 +1,20 @@ # Copyright (c) 2021, NVIDIA CORPORATION. 
from decimal import Decimal -from typing import cast, Any, Sequence, Union +from numbers import Number +from typing import Any, Sequence, Tuple, Union, cast import cupy as cp import numpy as np import pyarrow as pa from pandas.api.types import is_integer_dtype -from numbers import Number import cudf from cudf import _lib as libcudf +from cudf._lib.quantiles import quantile as cpp_quantile from cudf._lib.strings.convert.convert_fixed_point import ( from_decimal as cpp_from_decimal, ) -from cudf._lib.quantiles import quantile as cpp_quantile from cudf._typing import Dtype from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column @@ -229,6 +229,18 @@ def fillna( ) return self._copy_type_metadata(result) + def serialize(self) -> Tuple[dict, list]: + header, frames = super().serialize() + header["dtype"] = self.dtype.serialize() + header["size"] = self.size + return header, frames + + @classmethod + def deserialize(cls, header: dict, frames: list) -> ColumnBase: + dtype = cudf.Decimal64Dtype.deserialize(*header["dtype"]) + header["dtype"] = dtype + return super().deserialize(header, frames) + def _binop_scale(l_dtype, r_dtype, op): # This should at some point be hooked up to libcudf's diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 0c436cf36e7..7db8ba15caa 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -2,7 +2,7 @@ import decimal import pickle -from typing import Any, Optional +from typing import Any, Optional, Tuple import numpy as np import pandas as pd @@ -268,6 +268,10 @@ def __init__(self, precision, scale=0): self._validate(precision, scale) self._typ = pa.decimal128(precision, scale) + @property + def str(self): + return f"decimal64({self.precision}, {self.scale})" + @property def precision(self): return self._typ.precision @@ -325,6 +329,13 @@ def _from_decimal(cls, decimal): precision = max(len(metadata.digits), -metadata.exponent) return cls(precision, 
-metadata.exponent) + def serialize(self) -> Tuple[dict, list]: + return {"precision": self.precision, "scale": self.scale}, [] + + @classmethod + def deserialize(cls, header: dict, frames: list): + return cls(header["precision"], header["scale"]) + class IntervalDtype(StructDtype): name = "interval" diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py index f6502c4c1fd..111f973a78b 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -13,6 +13,7 @@ FLOAT_TYPES, INTEGER_TYPES, NUMERIC_TYPES, + _decimal_series, assert_eq, ) @@ -204,12 +205,6 @@ def test_typecast_from_decimal(data, from_dtype, to_dtype): assert_eq(got.dtype, expected.dtype) -def _decimal_series(input, dtype): - return cudf.Series( - [x if x is None else Decimal(x) for x in input], dtype=dtype, - ) - - @pytest.mark.parametrize( "args", [ diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py index 0e9c61b634d..d172033d30f 100644 --- a/python/cudf/cudf/tests/test_serialize.py +++ b/python/cudf/cudf/tests/test_serialize.py @@ -9,7 +9,7 @@ import cudf from cudf.tests import utils -from cudf.tests.utils import assert_eq +from cudf.tests.utils import _decimal_series, assert_eq @pytest.mark.parametrize( @@ -289,6 +289,44 @@ def test_serialize_list_columns(data): assert_eq(recreated, df) +@pytest.mark.parametrize( + "data", + [ + { + "a": _decimal_series( + ["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0) + ) + }, + { + "a": _decimal_series( + ["1", "2", "3"], dtype=cudf.Decimal64Dtype(1, 0) + ), + "b": _decimal_series( + ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1) + ), + "c": _decimal_series( + ["10.1", "20.2", "30.3"], dtype=cudf.Decimal64Dtype(3, 1) + ), + }, + { + "a": _decimal_series( + ["1", None, "3"], dtype=cudf.Decimal64Dtype(1, 0) + ), + "b": _decimal_series( + ["1.0", "2.0", None], dtype=cudf.Decimal64Dtype(2, 1) + ), + "c": _decimal_series( + [None, "20.2", 
"30.3"], dtype=cudf.Decimal64Dtype(3, 1) + ), + }, + ], +) +def test_serialize_decimal_columns(data): + df = cudf.DataFrame(data) + recreated = df.__class__.deserialize(*df.serialize()) + assert_eq(recreated, df) + + def test_deserialize_cudf_0_16(datadir): fname = datadir / "pkl" / "stringColumnWithRangeIndex_cudf_0.16.pkl" diff --git a/python/cudf/cudf/tests/utils.py b/python/cudf/cudf/tests/utils.py index b0427572ab6..672e83e6f64 100644 --- a/python/cudf/cudf/tests/utils.py +++ b/python/cudf/cudf/tests/utils.py @@ -3,6 +3,7 @@ import re from collections.abc import Mapping, Sequence from contextlib import contextmanager +from decimal import Decimal import cupy import numpy as np @@ -296,6 +297,12 @@ def gen_rand_series(dtype, size, **kwargs): return cudf.Series(values) +def _decimal_series(input, dtype): + return cudf.Series( + [x if x is None else Decimal(x) for x in input], dtype=dtype, + ) + + @contextmanager def does_not_raise(): yield