diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index af6f60b031d..aa3b4406320 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -333,8 +333,8 @@ add_library(cudf src/strings/char_types/char_cases.cu src/strings/char_types/char_types.cu src/strings/combine/concatenate.cu - src/strings/combine/concatenate_list_elements.cu src/strings/combine/join.cu + src/strings/combine/join_list_elements.cu src/strings/contains.cu src/strings/convert/convert_booleans.cu src/strings/convert/convert_datetime.cu diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp index 6887ef0e670..360efe15303 100644 --- a/cpp/include/cudf/strings/combine.hpp +++ b/cpp/include/cudf/strings/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,12 +30,21 @@ namespace strings { * @brief Strings APIs for concatenate and join */ +/** + * @brief Setting for specifying how separators are added with + * null strings elements. + */ +enum class separator_on_nulls { + YES, ///< Always add separators between elements + NO ///< Do not add separators if an element is null +}; + /** * @brief Concatenates all strings in the column into one new string delimited * by an optional separator string. * * This returns a column with one string. Any null entries are ignored unless - * the narep parameter specifies a replacement string. + * the @p narep parameter specifies a replacement string. * * @code{.pseudo} * Example: @@ -70,11 +79,9 @@ std::unique_ptr join_strings( * * - If row separator for a given row is null, output column for that row is null, unless * there is a valid @p separator_narep - * - If all column values for a given row is null, output column for that row is null, unless - * there is a valid @p col_narep - * - null column values for a given row are skipped, if the column replacement isn't valid - * - The separator is only applied between two valid column values - * - If valid @p separator_narep and @p col_narep are provided, the output column is always + * - The separator is applied between two output row values if the @p separate_nulls + * is `YES` or only between valid rows if @p separate_nulls is `NO`. + * - If @p separator_narep and @p col_narep are both valid, the output column is always * non nullable * * @code{.pseudo} @@ -83,16 +90,25 @@ std::unique_ptr join_strings( * c1 = [null, 'cc', 'dd', null, null, 'gg'] * c2 = ['bb', '', null, null, null, 'hh'] * sep = ['::', '%%', '^^', '!', '*', null] - * out0 = concatenate([c0, c1, c2], sep) - * out0 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, null] + * out = concatenate({c0, c1, c2}, sep) + * // all rows have at least one null or sep[i]==null + * out is [null, null, null, null, null, null] * * sep_rep = '+' - * out1 = concatenate([c0, c1, c2], sep, sep_rep) - * out1 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, 'ff+gg+hh'] - * - * col_rep = '-' - * out2 = concatenate([c0, c1, c2], sep, invalid_sep_rep, col_rep) - * out2 is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] + * out = concatenate({c0, c1, c2}, sep, sep_rep) + * // all rows with at least one null output as null + * out is [null, null, null, null, null, 'ff+gg+hh'] + * + * col_narep = '-' + * sep_na = non-valid scalar + * out = concatenate({c0, c1, c2}, sep, sep_na, col_narep) + * // only the null entry in the sep column produces a null row + * out is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null] + * + * col_narep = '' + * out = concatenate({c0, c1, c2}, sep, sep_rep, col_narep, separator_on_nulls:NO) + * // parameter suppresses separator for null rows + * out is ['aa::bb', 'cc%%', '^^dd', 'ee', '', 'ff+gg+hh'] * @endcode * * @throw cudf::logic_error if no input columns are specified - table view is empty @@ -108,6 +124,8 @@ std::unique_ptr join_strings( * @param col_narep String that should be used in place of any null strings * found in any column. Default of invalid-scalar means no null column value replacements. * Default is an invalid string. + * @param separate_nulls If YES, then the separator is included for null rows + * if `col_narep` is valid. * @param mr Resource for allocating device memory. * @return New column with concatenated results. */ @@ -116,15 +134,9 @@ std::unique_ptr concatenate( strings_column_view const& separators, string_scalar const& separator_narep = string_scalar("", false), string_scalar const& col_narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @addtogroup strings_combine - * @{ - * @file strings/combine.hpp - * @brief Strings APIs for concatenate and join - */ - /** * @brief Row-wise concatenates the given list of strings columns and * returns a single strings column result. @@ -136,20 +148,30 @@ std::unique_ptr concatenate( * row to be null entry unless a narep string is specified to be used * in its place. * - * The number of strings in the columns provided must be the same. + * If @p separate_nulls is set to `NO` and @p narep is valid then + * separators are not added to the output between null elements. + * Otherwise, separators are always added if @p narep is valid. + * + * More than one column must be specified in the input @p strings_columns + * table. * * @code{.pseudo} * Example: - * s1 = ['aa', null, '', 'aa'] - * s2 = ['', 'bb', 'bb', null] - * r1 = concatenate([s1,s2]) - * r1 is ['aa', null, 'bb', null] - * r2 = concatenate([s1,s2],':','_') - * r2 is ['aa:', '_:bb', ':bb', 'aa:_'] + * s1 = ['aa', null, '', 'dd'] + * s2 = ['', 'bb', 'cc', null] + * out = concatenate({s1, s2}) + * out is ['aa', null, 'cc', null] + * + * out = concatenate({s1, s2}, ':', '_') + * out is ['aa:', '_:bb', ':cc', 'dd:_'] + * + * out = concatenate({s1, s2}, ':', '', separator_on_nulls::NO) + * out is ['aa:', 'bb', ':cc', 'dd'] * @endcode * * @throw cudf::logic_error if input columns are not all strings columns. * @throw cudf::logic_error if separator is not valid. + * @throw cudf::logic_error if only one column is specified * * @param strings_columns List of string columns to concatenate. * @param separator String that should inserted between each string from each row. @@ -157,6 +179,7 @@ std::unique_ptr concatenate( * @param narep String that should be used in place of any null strings * found in any column. Default of invalid-scalar means any null entry in any column will * produces a null result for that row. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ @@ -164,6 +187,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -171,24 +195,30 @@ std::unique_ptr concatenate( * within each row and returns a single strings column result. * * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the row separator provided in the `separators` strings column. + * delimited by the row separator provided in the @p separators strings column. * * A null list row will always result in a null string in the output row. Any non-null list row * having a null element will result in the corresponding output row to be null unless a valid - * `string_narep` scalar is provided to be used in its place. Any null row in the `separators` - * column will also result in a null output row unless a valid `separator_narep` scalar is provided + * @p string_narep scalar is provided to be used in its place. Any null row in the @p separators + * column will also result in a null output row unless a valid @p separator_narep scalar is provided * to be used in place of the null separators. * + * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the + * output between null elements. Otherwise, separators are always added if @p narep is valid. + * * @code{.pseudo} * Example: - * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff', 'gg'} ] + * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff', 'gg'] ] * sep = ['::', '%%', '!', '*', null] * - * r1 = strings::concatenate_list_elements(s, sep) - * r1 is ['aa::bb::cc', null, '!dd', null, null] + * out = join_list_elements(s, sep) + * out is ['aa::bb::cc', null, '!dd', null, null] + * + * out = join_list_elements(s, sep, ':', '_') + * out is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] * - * r2 = strings::concatenate_list_elements(s, sep, ':', '_') - * r2 is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg'] + * out = join_list_elements(s, sep, ':', '', separator_on_nulls::NO) + * out is ['aa::bb::cc', null, '!dd', 'ee', 'ff:gg'] * @endcode * * @throw cudf::logic_error if input column is not lists of strings column. @@ -203,14 +233,16 @@ std::unique_ptr concatenate( * @param string_narep String that should be used to replace null strings in any non-null list row, * default is an invalid-scalar denoting that list rows containing null strings will result * in null string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with concatenated results. */ -std::unique_ptr concatenate_list_elements( +std::unique_ptr join_list_elements( const lists_column_view& lists_strings_column, const strings_column_view& separators, string_scalar const& separator_narep = string_scalar("", false), string_scalar const& string_narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -218,21 +250,27 @@ std::unique_ptr concatenate_list_elements( * within each row and returns a single strings column result. * * Each new string is created by concatenating the strings from the same row (same list element) - * delimited by the separator provided. + * delimited by the @p separator provided. * * A null list row will always result in a null string in the output row. Any non-null list row - * having a null elenent will result in the corresponding output row to be null unless a narep - * string is specified to be used in its place. + * having a null elenent will result in the corresponding output row to be null unless a + * @p narep string is specified to be used in its place. + * + * If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the + * output between null elements. Otherwise, separators are always added if @p narep is valid. * * @code{.pseudo} * Example: - * s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff'} ] + * s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff'] ] + * + * out = join_list_elements(s) + * out is ['aabbcc', null, 'dd', null, 'ff'] * - * r1 = strings::concatenate_list_elements(s) - * r1 is ['aabbcc', null, 'dd', null, 'ff'] + * out = join_list_elements(s, ':', '_') + * out is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] * - * r2 = strings::concatenate_list_elements(s, ':', '_') - * r2 is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff'] + * out = join_list_elements(s, ':', '', separator_on_nulls::NO) + * out is ['aa:bb:cc', null, ':dd', 'ee', 'ff'] * @endcode * * @throw cudf::logic_error if input column is not lists of strings column. @@ -244,13 +282,15 @@ std::unique_ptr concatenate_list_elements( * @param narep String that should be used to replace null strings in any non-null list row, default * is an invalid-scalar denoting that list rows containing null strings will result in null * string in the corresponding output rows. + * @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column with concatenated results. */ -std::unique_ptr concatenate_list_elements( +std::unique_ptr join_list_elements( const lists_column_view& lists_strings_column, string_scalar const& separator = string_scalar(""), string_scalar const& narep = string_scalar("", false), + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index 6e25a4dfa38..d6bdf398886 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -36,6 +37,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls = separator_on_nulls::YES, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index d2b6be5eead..c8559b7d9db 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -404,11 +405,19 @@ void writer::impl::write(table_view const& table, auto str_table_view = str_table_ptr->view(); // concatenate columns in each row into one big string column - //(using null representation and delimiter): + // (using null representation and delimiter): // std::string delimiter_str{options_.get_inter_column_delimiter()}; - auto str_concat_col = cudf::strings::detail::concatenate( - str_table_view, delimiter_str, options_.get_na_rep(), stream); + auto str_concat_col = [&] { + if (str_table_view.num_columns() > 1) + return cudf::strings::detail::concatenate(str_table_view, + delimiter_str, + options_.get_na_rep(), + strings::separator_on_nulls::YES, + stream); + cudf::string_scalar narep{options_.get_na_rep()}; + return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream); + }(); write_chunked(str_concat_col->view(), metadata, stream); } diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 5d7b9152ff3..1329ad3113f 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -41,67 +41,93 @@ namespace strings { namespace detail { namespace { -/** - * @brief Concatenate strings functor - * - * This will concatenate the strings from each row of the given table - * and apply the separator. The null-replacement string `d_narep` is - * used in place of any string in a row that contains a null entry. - */ -struct concat_strings_fn { +struct concat_strings_base { table_device_view const d_table; - string_view const d_separator; string_scalar_device_view const d_narep; + separator_on_nulls separate_nulls; offset_type* d_offsets{}; char* d_chars{}; - __device__ void operator()(size_type idx) + /** + * @brief Concatenate each table row to a single output string. + * + * This will concatenate the strings from each row of the given table + * and apply the separator. The null-replacement string `d_narep` is + * used in place of any string in a row that contains a null entry. + * + * @param idx The current row to process + * @param d_separator String to place in between each column's row + */ + __device__ void process_row(size_type idx, string_view const d_separator) { - bool const null_element = - thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { - return col.is_null(idx); - }); - // handle a null row - if (null_element && !d_narep.is_valid()) { + if (!d_narep.is_valid() && + thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { + return col.is_null(idx); + })) { if (!d_chars) d_offsets[idx] = 0; return; } - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - size_type bytes = 0; + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; + offset_type bytes = 0; + bool write_separator = false; + for (auto itr = d_table.begin(); itr < d_table.end(); ++itr) { - auto const d_column = *itr; - auto const d_str = - d_column.is_null(idx) ? d_narep.value() : d_column.element(idx); - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); - bytes += d_str.size_bytes(); - // separator goes only in between elements - if (itr + 1 < d_table.end()) { + auto const d_column = *itr; + bool const null_element = d_column.is_null(idx); + + if (write_separator && (separate_nulls == separator_on_nulls::YES || !null_element)) { if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_separator); bytes += d_separator.size_bytes(); + write_separator = false; } + + // write out column's row data (or narep if the row is null) + auto const d_str = null_element ? d_narep.value() : d_column.element(idx); + if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); + bytes += d_str.size_bytes(); + + write_separator = + write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element; } + if (!d_chars) d_offsets[idx] = bytes; } }; +/** + * @brief Single separator concatenate functor + */ +struct concat_strings_fn : concat_strings_base { + string_view const d_separator; + + concat_strings_fn(table_device_view const& d_table, + string_view const& d_separator, + string_scalar_device_view const& d_narep, + separator_on_nulls separate_nulls) + : concat_strings_base{d_table, d_narep, separate_nulls}, d_separator(d_separator) + { + } + + __device__ void operator()(size_type idx) { process_row(idx, d_separator); } +}; + } // namespace std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { auto const num_columns = strings_columns.num_columns(); - CUDF_EXPECTS(num_columns > 0, "At least one column must be specified"); + CUDF_EXPECTS(num_columns > 1, "At least two columns must be specified"); // check all columns are of type string CUDF_EXPECTS(std::all_of(strings_columns.begin(), strings_columns.end(), [](auto c) { return c.type().id() == type_id::STRING; }), "All columns must be of type string"); - if (num_columns == 1) // single strings column returns a copy - return std::make_unique(*(strings_columns.begin()), stream, mr); auto const strings_count = strings_columns.num_rows(); if (strings_count == 0) // empty begets empty return detail::make_empty_strings_column(stream, mr); @@ -112,7 +138,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. auto d_table = table_device_view::create(strings_columns, stream); - concat_strings_fn fn{*d_table, d_separator, d_narep}; + concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls}; auto children = make_strings_children(fn, strings_count, stream, mr); // create resulting null mask @@ -120,9 +146,9 @@ std::unique_ptr concatenate(table_view const& strings_columns, thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), [d_table = *d_table, d_narep] __device__(size_type idx) { - bool null_element = thrust::any_of( + if (d_narep.is_valid()) return true; + return !thrust::any_of( thrust::seq, d_table.begin(), d_table.end(), [idx](auto col) { return col.is_null(idx); }); - return (!null_element || d_narep.is_valid()); }, stream, mr); @@ -145,68 +171,42 @@ namespace { * when a separator row is null `d_separator_narep`. The `d_narep` is * used in place of a null entry in the strings columns. */ -struct multi_separator_concat_fn { - table_device_view const d_table; +struct multi_separator_concat_fn : concat_strings_base { column_device_view const d_separators; string_scalar_device_view const d_separator_narep; - string_scalar_device_view const d_narep; - offset_type* d_offsets{}; - char* d_chars{}; - __device__ void operator()(size_type idx) + multi_separator_concat_fn(table_device_view const& d_table, + column_device_view const& d_separators, + string_scalar_device_view const& d_separator_narep, + string_scalar_device_view const& d_narep, + separator_on_nulls separate_nulls) + : concat_strings_base{d_table, d_narep, separate_nulls}, + d_separators(d_separators), + d_separator_narep(d_separator_narep) { - bool const all_nulls = - thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) { - return col.is_null(idx); - }); + } - if ((d_separators.is_null(idx) && !d_separator_narep.is_valid()) || - (all_nulls && !d_narep.is_valid())) { + __device__ void operator()(size_type idx) + { + if (d_separators.is_null(idx) && !d_separator_narep.is_valid()) { if (!d_chars) d_offsets[idx] = 0; return; } - // point to output location - char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; - offset_type bytes = 0; - - // there is at least one non-null column value auto const d_separator = d_separators.is_valid(idx) ? d_separators.element(idx) : d_separator_narep.value(); - auto const d_null_rep = d_narep.is_valid() ? d_narep.value() : string_view{}; - - // write output entry for this row - bool colval_written = false; // state variable for writing separators - for (auto const d_column : d_table) { - // if the row is null and if there is no replacement, skip it - if (d_column.is_null(idx) && !d_narep.is_valid()) continue; - - // separator in this row is written only after the first output - if (colval_written) { - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_separator); - bytes += d_separator.size_bytes(); - } - - // write out column's row data (or narep if the row is null) - string_view const d_str = - d_column.is_null(idx) ? d_null_rep : d_column.element(idx); - if (d_buffer) d_buffer = detail::copy_string(d_buffer, d_str); - bytes += d_str.size_bytes(); - - // column's string or narep could by empty so we need this flag - // to know we got this far even if no actual bytes were copied - colval_written = true; // use the separator before the next column - } - - if (!d_chars) d_offsets[idx] = bytes; + // base class utility function handles the rest + process_row(idx, d_separator); } }; + } // namespace std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, + separator_on_nulls separate_nulls, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -234,20 +234,19 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. auto d_table = table_device_view::create(strings_columns, stream); - multi_separator_concat_fn mscf{*d_table, separator_col_view, separator_rep, col_rep}; + multi_separator_concat_fn mscf{ + *d_table, separator_col_view, separator_rep, col_rep, separate_nulls}; auto children = make_strings_children(mscf, strings_count, stream, mr); // Create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), - [d_table = *d_table, separator_col_view, separator_rep, col_rep] __device__(size_type ridx) { - if (!separator_col_view.is_valid(ridx) && !separator_rep.is_valid()) return false; - bool all_nulls = - thrust::all_of(thrust::seq, d_table.begin(), d_table.end(), [ridx](auto const& col) { - return col.is_null(ridx); - }); - return all_nulls ? col_rep.is_valid() : true; + [d_table = *d_table, separator_col_view, separator_rep, col_rep] __device__(size_type idx) { + if (!separator_col_view.is_valid(idx) && !separator_rep.is_valid()) return false; + if (col_rep.is_valid()) return true; + return !thrust::any_of( + thrust::seq, d_table.begin(), d_table.end(), [idx](auto col) { return col.is_null(idx); }); }, stream, mr); @@ -268,21 +267,29 @@ std::unique_ptr concatenate(table_view const& strings_columns, std::unique_ptr concatenate(table_view const& strings_columns, string_scalar const& separator, string_scalar const& narep, + separator_on_nulls separate_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate(strings_columns, separator, narep, rmm::cuda_stream_default, mr); + return detail::concatenate( + strings_columns, separator, narep, separate_nulls, rmm::cuda_stream_default, mr); } std::unique_ptr concatenate(table_view const& strings_columns, strings_column_view const& separators, string_scalar const& separator_narep, string_scalar const& col_narep, + separator_on_nulls separate_nulls, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate( - strings_columns, separators, separator_narep, col_narep, rmm::cuda_stream_default, mr); + return detail::concatenate(strings_columns, + separators, + separator_narep, + col_narep, + separate_nulls, + rmm::cuda_stream_default, + mr); } } // namespace strings diff --git a/cpp/src/strings/combine/concatenate_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu similarity index 64% rename from cpp/src/strings/combine/concatenate_list_elements.cu rename to cpp/src/strings/combine/join_list_elements.cu index 1157b8f3fce..7a83097566c 100644 --- a/cpp/src/strings/combine/concatenate_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,7 @@ struct compute_size_and_concatenate_fn { offset_type const* const list_offsets; column_device_view const strings_dv; string_scalar_device_view const string_narep_dv; + separator_on_nulls const separate_nulls; offset_type* d_offsets{nullptr}; @@ -72,33 +74,38 @@ struct compute_size_and_concatenate_fn { return; } - auto const separator = func.separator(idx); - auto const separator_size = separator.size_bytes(); - auto size_bytes = size_type{0}; - bool written = false; - char* output_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; + auto const separator = func.separator(idx); + auto size_bytes = size_type{0}; + char* output_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr; + bool write_separator = false; for (size_type str_idx = list_offsets[idx], idx_end = list_offsets[idx + 1]; str_idx < idx_end; ++str_idx) { - if (not d_chars and (strings_dv.is_null(str_idx) and not string_narep_dv.is_valid())) { + bool null_element = strings_dv.is_null(str_idx); + + if (not d_chars and (null_element and not string_narep_dv.is_valid())) { d_offsets[idx] = 0; d_validities[idx] = false; return; // early termination: the entire list of strings will result in a null string } - auto const d_str = strings_dv.is_null(str_idx) ? string_narep_dv.value() - : strings_dv.element(str_idx); - size_bytes += separator_size + d_str.size_bytes(); - if (output_ptr) { - // Separator is inserted only in between strings - if (written) { output_ptr = detail::copy_string(output_ptr, separator); } - output_ptr = detail::copy_string(output_ptr, d_str); - written = true; + + if (write_separator && (separate_nulls == separator_on_nulls::YES || !null_element)) { + if (output_ptr) output_ptr = detail::copy_string(output_ptr, separator); + size_bytes += separator.size_bytes(); + write_separator = false; } + + auto const d_str = + null_element ? string_narep_dv.value() : strings_dv.element(str_idx); + if (output_ptr) output_ptr = detail::copy_string(output_ptr, d_str); + size_bytes += d_str.size_bytes(); + + write_separator = + write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element; } - // Separator is inserted only in between strings if (not d_chars) { - d_offsets[idx] = static_cast(size_bytes - separator_size); + d_offsets[idx] = size_bytes; d_validities[idx] = true; } } @@ -123,11 +130,12 @@ struct scalar_separator_fn { } // namespace -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - string_scalar const& separator, - string_scalar const& narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, "The input column must be a column of lists of strings"); @@ -146,14 +154,14 @@ std::unique_ptr concatenate_list_elements(lists_column_view const& lists auto const sep_dv = get_scalar_device_view(const_cast(separator)); auto const string_narep_dv = get_scalar_device_view(const_cast(narep)); - auto const func = scalar_separator_fn{sep_dv}; - auto const comp_fn = compute_size_and_concatenate_fn{ - func, - *lists_dv_ptr, - lists_strings_column.offsets_begin(), - *strings_dv_ptr, - string_narep_dv, - }; + auto const func = scalar_separator_fn{sep_dv}; + auto const comp_fn = + compute_size_and_concatenate_fn{func, + *lists_dv_ptr, + lists_strings_column.offsets_begin(), + *strings_dv_ptr, + string_narep_dv, + separate_nulls}; auto [offsets_column, chars_column, null_mask, null_count] = make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); @@ -191,12 +199,13 @@ struct column_separators_fn { } // namespace -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - strings_column_view const& separators, - string_scalar const& separator_narep, - string_scalar const& string_narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + strings_column_view const& separators, + string_scalar const& separator_narep, + string_scalar const& string_narep, + separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING, "The input column must be a column of lists of strings"); @@ -217,14 +226,14 @@ std::unique_ptr concatenate_list_elements(lists_column_view const& lists auto const sep_dv_ptr = column_device_view::create(separators.parent(), stream); auto const sep_narep_dv = get_scalar_device_view(const_cast(separator_narep)); - auto const func = column_separators_fn{*sep_dv_ptr, sep_narep_dv}; - auto const comp_fn = compute_size_and_concatenate_fn{ - func, - *lists_dv_ptr, - lists_strings_column.offsets_begin(), - *strings_dv_ptr, - string_narep_dv, - }; + auto const func = column_separators_fn{*sep_dv_ptr, sep_narep_dv}; + auto const comp_fn = + compute_size_and_concatenate_fn{func, + *lists_dv_ptr, + lists_strings_column.offsets_begin(), + *strings_dv_ptr, + string_narep_dv, + separate_nulls}; auto [offsets_column, chars_column, null_mask, null_count] = make_strings_children_with_null_mask(comp_fn, num_rows, num_rows, stream, mr); @@ -239,25 +248,32 @@ std::unique_ptr concatenate_list_elements(lists_column_view const& lists } // namespace detail -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - string_scalar const& separator, - string_scalar const& narep, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements( - lists_strings_column, separator, narep, rmm::cuda_stream_default, mr); + return detail::join_list_elements( + lists_strings_column, separator, narep, separate_nulls, rmm::cuda_stream_default, mr); } -std::unique_ptr concatenate_list_elements(lists_column_view const& lists_strings_column, - strings_column_view const& separators, - string_scalar const& separator_narep, - string_scalar const& string_narep, - rmm::mr::device_memory_resource* mr) +std::unique_ptr join_list_elements(lists_column_view const& lists_strings_column, + strings_column_view const& separators, + string_scalar const& separator_narep, + string_scalar const& string_narep, + separator_on_nulls separate_nulls, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::concatenate_list_elements( - lists_strings_column, separators, separator_narep, string_narep, rmm::cuda_stream_default, mr); + return detail::join_list_elements(lists_strings_column, + separators, + separator_narep, + string_narep, + separate_nulls, + rmm::cuda_stream_default, + mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f36ec70479b..bbcfd69a52b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -328,8 +328,8 @@ ConfigureTest(STRINGS_TEST strings/booleans_tests.cpp strings/case_tests.cpp strings/chars_types_tests.cpp - strings/combine/concatenate_list_elements_tests.cpp strings/combine/concatenate_tests.cpp + strings/combine/join_list_elements_tests.cpp strings/combine/join_strings_tests.cpp strings/concatenate_tests.cpp strings/contains_tests.cpp diff --git a/cpp/tests/strings/combine/concatenate_tests.cpp b/cpp/tests/strings/combine/concatenate_tests.cpp index c1c390e8a82..d91f669e42d 100644 --- a/cpp/tests/strings/combine/concatenate_tests.cpp +++ b/cpp/tests/strings/combine/concatenate_tests.cpp @@ -95,6 +95,58 @@ TEST_F(StringsCombineTest, Concatenate) } } +TEST_F(StringsCombineTest, ConcatenateSkipNulls) +{ + cudf::test::strings_column_wrapper strings1({"eee", "", "", "", "aa", "bbb", "ééé"}, + {1, 0, 0, 1, 1, 1, 1}); + cudf::test::strings_column_wrapper strings2({"xyz", "", "d", "éa", "", "", "f"}, + {1, 0, 1, 1, 1, 0, 1}); + cudf::test::strings_column_wrapper strings3({"q", "", "s", "t", "u", "", "w"}, + {1, 1, 1, 1, 1, 0, 1}); + + cudf::table_view table({strings1, strings2, strings3}); + + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "++", "+d+s", "+éa+t", "aa++u", "bbb++", "ééé+f+w"}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::YES); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "d+s", "+éa+t", "aa++u", "bbb", "ééé+f+w"}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "", "+éa+t", "aa++u", "", "ééé+f+w"}, {1, 0, 0, 1, 1, 0, 1}); + auto results = cudf::strings::concatenate(table, + cudf::string_scalar("+"), + cudf::string_scalar("", false), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } + { + cudf::test::strings_column_wrapper sep_col({"+", "-", ".", "@", "*", "^^", "#"}); + auto results = cudf::strings::concatenate(table, + cudf::strings_column_view(sep_col), + cudf::string_scalar(""), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + + cudf::test::strings_column_wrapper expected( + {"eee+xyz+q", "", "d.s", "@éa@t", "aa**u", "bbb", "ééé#f#w"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + } +} + TEST_F(StringsCombineTest, ConcatZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( @@ -107,6 +159,12 @@ TEST_F(StringsCombineTest, ConcatZeroSizeStringsColumns) cudf::test::expect_strings_empty(results->view()); } +TEST_F(StringsCombineTest, SingleColumnErrorCheck) +{ + cudf::column_view col0(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); + EXPECT_THROW(cudf::strings::concatenate(cudf::table_view{{col0}}), cudf::logic_error); +} + struct StringsConcatenateWithColSeparatorTest : public cudf::test::BaseFixture { }; @@ -157,7 +215,6 @@ TEST_F(StringsConcatenateWithColSeparatorTest, SingleColumnEmptyAndNullStringsNo auto exp_results = cudf::test::strings_column_wrapper({"", "", "", ""}, {false, true, false, false}); - auto results = cudf::strings::concatenate(cudf::table_view{{col0}}, cudf::strings_column_view(sep_col)); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); @@ -295,12 +352,20 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnEmptyAndNullStringsNoR auto sep_col = cudf::test::strings_column_wrapper( {"", "", "", "", "", "", "", ""}, {true, false, true, false, true, false, true, false}); - auto exp_results = cudf::test::strings_column_wrapper( - {"", "", "", "", "", "", "", ""}, {false, false, true, false, true, false, true, false}); - + auto exp_results1 = cudf::test::strings_column_wrapper( + {"", "", "", "", "", "", "", ""}, {false, false, true, false, false, false, false, false}); auto results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"", "", "", "", "", "", "", ""}, {true, false, true, false, true, false, true, false}); + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + cudf::string_scalar("", false), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixNoReplacements) @@ -315,13 +380,23 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixNoReplacement {"", "~~~", "", "@", "", "", "", "^^^^", "", "--", "*****", "######"}, {true, true, false, true, false, true, false, true, true, true, true, true}); - auto exp_results = cudf::test::strings_column_wrapper( - {"eeexyzfoo", "~~~", "", "éééf", "", "", "", "valid", "doo", "", "", ""}, - {true, true, false, true, false, true, false, true, true, false, false, false}); + auto exp_results1 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "", "", "", "", "", "", "", "", "", ""}, + {true, true, false, false, false, false, false, false, false, false, false, false}); auto results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col)); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "", "éééf", "", "", "", "valid", "doo", "", "", ""}, + {true, true, false, true, false, true, false, true, true, true, true, true}); + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + cudf::string_scalar("", false), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixSeparatorReplacement) @@ -335,26 +410,26 @@ TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixSeparatorRepl auto sep_col = cudf::test::strings_column_wrapper( {"", "~~~", "", "@", "", "", "", "^^^^", "", "--", "*****", "######"}, {true, true, false, true, false, true, false, true, true, true, true, true}); - auto sep_rep = cudf::string_scalar("!!!!!!!!!!"); + auto sep_rep = cudf::string_scalar("!!!!!!!"); - auto exp_results = cudf::test::strings_column_wrapper( - {"eeexyzfoo", - "~~~", - "!!!!!!!!!!éaff", - "éééf", - "éa", - "", - "éaff", - "valid", - "doo", - "", - "", - ""}, - {true, true, true, true, true, true, true, true, true, false, false, false}); + auto exp_results1 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "!!!!!!!éaff", "éééf", "éa", "", "éaff", "valid", "doo", "", "", ""}, + {true, true, true, false, false, false, false, false, false, false, false, false}); auto results = cudf::strings::concatenate( cudf::table_view{{col0, col1}}, cudf::strings_column_view(sep_col), sep_rep); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results, true); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results1, true); + + auto exp_results2 = cudf::test::strings_column_wrapper( + {"eeexyzfoo", "~~~", "!!!!!!!éaff", "éééf", "éa", "", "éaff", "valid", "doo", "", "", ""}, + {true, true, true, true, true, true, true, true, true, true, true, true}); + + results = cudf::strings::concatenate(cudf::table_view{{col0, col1}}, + cudf::strings_column_view(sep_col), + sep_rep, + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results2, true); } TEST_F(StringsConcatenateWithColSeparatorTest, MultiColumnStringMixColumnReplacement) diff --git a/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp b/cpp/tests/strings/combine/join_list_elements_tests.cpp similarity index 82% rename from cpp/tests/strings/combine/concatenate_list_elements_tests.cpp rename to cpp/tests/strings/combine/join_list_elements_tests.cpp index b6afd588dfb..e2f7c3e36a2 100644 --- a/cpp/tests/strings/combine/concatenate_list_elements_tests.cpp +++ b/cpp/tests/strings/combine/join_list_elements_tests.cpp @@ -58,7 +58,7 @@ TEST_F(StringsListsConcatenateTest, InvalidInput) { auto const string_lists = INT_LISTS{{1, 2, 3}, {4, 5, 6}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); - EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv), cudf::logic_error); + EXPECT_THROW(cudf::strings::join_list_elements(string_lv), cudf::logic_error); } // Invalid scalar separator @@ -66,9 +66,8 @@ TEST_F(StringsListsConcatenateTest, InvalidInput) auto const string_lists = STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); - EXPECT_THROW( - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("", false)), - cudf::logic_error); + EXPECT_THROW(cudf::strings::join_list_elements(string_lv, cudf::string_scalar("", false)), + cudf::logic_error); } // Invalid column separators @@ -77,7 +76,7 @@ TEST_F(StringsListsConcatenateTest, InvalidInput) STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); auto const separators = STR_COL{"+++"}.release(); // size doesn't match with lists column size - EXPECT_THROW(cudf::strings::concatenate_list_elements(string_lv, separators->view()), + EXPECT_THROW(cudf::strings::join_list_elements(string_lv, separators->view()), cudf::logic_error); } } @@ -87,26 +86,26 @@ TEST_F(StringsListsConcatenateTest, EmptyInput) auto const string_lists = STR_LISTS{}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); auto const expected = STR_COL{}; - auto results = cudf::strings::concatenate_list_elements(string_lv); + auto results = cudf::strings::join_list_elements(string_lv); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); auto const separators = STR_COL{}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + results = cudf::strings::join_list_elements(string_lv, separators->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } TEST_F(StringsListsConcatenateTest, ZeroSizeStringsInput) { auto const string_lists = - STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}}.release(); + STR_LISTS{STR_LISTS{""}, STR_LISTS{"", "", ""}, STR_LISTS{"", ""}, STR_LISTS{}}.release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); - auto const expected = STR_COL{"", "", ""}; + auto const expected = STR_COL{"", "", "", ""}; - auto results = cudf::strings::concatenate_list_elements(string_lv); + auto results = cudf::strings::join_list_elements(string_lv); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); - auto const separators = STR_COL{"", "", ""}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + auto const separators = STR_COL{"", "", "", ""}.release(); + results = cudf::strings::join_list_elements(string_lv, separators->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } @@ -120,29 +119,35 @@ TEST_F(StringsListsConcatenateTest, AllNullsStringsInput) auto const string_lv = cudf::lists_column_view(string_lists->view()); auto const expected = STR_COL{{"", "", ""}, all_nulls()}; - auto results = cudf::strings::concatenate_list_elements(string_lv); + auto results = cudf::strings::join_list_elements(string_lv); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); auto const separators = STR_COL{{"", "", ""}, all_nulls()}.release(); - results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + results = cudf::strings::join_list_elements(string_lv, separators->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } +auto null_at(std::initializer_list indices) +{ + return cudf::detail::make_counting_transform_iterator( + 0, [indices](auto i) { return std::find(indices.begin(), indices.end(), i) == indices.end(); }); +} + TEST_F(StringsListsConcatenateTest, ScalarSeparator) { auto const string_lists = STR_LISTS{{STR_LISTS{{"a", "bb" /*NULL*/, "ccc"}, null_at(1)}, STR_LISTS{}, /*NULL*/ STR_LISTS{{"ddd" /*NULL*/, "efgh", "ijk"}, null_at(0)}, - STR_LISTS{"zzz", "xxxxx"}}, + STR_LISTS{"zzz", "xxxxx"}, + STR_LISTS{{"v", "", "", "w"}, null_at({1, 2})}}, null_at(1)} .release(); auto const string_lv = cudf::lists_column_view(string_lists->view()); // No null replacement { - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); - std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); + std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx", nullptr}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); @@ -150,10 +155,22 @@ TEST_F(StringsListsConcatenateTest, ScalarSeparator) // With null replacement { - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{ - "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx"}; + "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx", "v+++___+++___+++w"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } + + // Turn off separator-on-nulls + { + auto const results = cudf::strings::join_list_elements(string_lv, + cudf::string_scalar("+++"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + std::vector h_expected{"a+++ccc", nullptr, "efgh+++ijk", "zzz+++xxxxx", "v+++w"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); @@ -181,8 +198,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the entire lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{nullptr, nullptr, nullptr, @@ -202,7 +218,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the entire lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{"a+++___+++ccc", nullptr, @@ -223,8 +239,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the first half of the lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{nullptr, nullptr, nullptr, "zzz+++xxxxx"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -234,7 +249,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the first half of the lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{ "a+++___+++ccc", nullptr, "___+++efgh+++ijk", "zzz+++xxxxx"}; @@ -246,8 +261,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the second half of the lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{ nullptr, nullptr, "0a0b0c+++5x5y5z", nullptr, "ééé+++12345abcdef", "aaaééébbbéééccc+++12345"}; auto const expected = @@ -258,7 +272,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the second half of the lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{"abcdef+++012345+++___+++xxx000", "___+++11111+++00000", @@ -274,8 +288,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the middle part of the lists column, no null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); - auto const results = - cudf::strings::concatenate_list_elements(string_lv, cudf::string_scalar("+++")); + auto const results = cudf::strings::join_list_elements(string_lv, cudf::string_scalar("+++")); std::vector h_expected{ "zzz+++xxxxx", nullptr, nullptr, nullptr, "0a0b0c+++5x5y5z"}; auto const expected = @@ -286,7 +299,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithScalarSeparator) // Sliced the middle part of the lists column, with null replacement { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, cudf::string_scalar("+++"), cudf::string_scalar("___")); std::vector h_expected{"zzz+++xxxxx", nullptr, @@ -318,7 +331,7 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // No null replacement { - auto const results = cudf::strings::concatenate_list_elements(string_lv, separators->view()); + auto const results = cudf::strings::join_list_elements(string_lv, separators->view()); std::vector h_expected{nullptr, nullptr, nullptr, nullptr, nullptr, "zzz^^^xxxxx"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -327,8 +340,8 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // With null replacement for separators { - auto const results = cudf::strings::concatenate_list_elements( - string_lv, separators->view(), cudf::string_scalar("|||")); + auto const results = + cudf::strings::join_list_elements(string_lv, separators->view(), cudf::string_scalar("|||")); std::vector h_expected{ nullptr, nullptr, "0a0b0c|||xyzééé", nullptr, nullptr, "zzz^^^xxxxx"}; auto const expected = @@ -338,7 +351,7 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // With null replacement for strings { - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, separators->view(), cudf::string_scalar("", false), cudf::string_scalar("XXXXX")); std::vector h_expected{ "a+++XXXXX+++ccc", nullptr, nullptr, nullptr, "XXXXX%%%ááá%%%ííí", "zzz^^^xxxxx"}; @@ -349,7 +362,7 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) // With null replacement for both separators and strings { - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, separators->view(), cudf::string_scalar("|||"), cudf::string_scalar("XXXXX")); std::vector h_expected{"a+++XXXXX+++ccc", nullptr, @@ -361,6 +374,20 @@ TEST_F(StringsListsConcatenateTest, ColumnSeparators) STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); } + + // Turn off separator-on-nulls + { + auto const results = cudf::strings::join_list_elements(string_lv, + separators->view(), + cudf::string_scalar("+++"), + cudf::string_scalar(""), + cudf::strings::separator_on_nulls::NO); + std::vector h_expected{ + "a+++ccc", nullptr, "0a0b0c+++xyzééé", "efgh+++ijk", "ááá%%%ííí", "zzz^^^xxxxx"}; + auto const expected = + STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected, print_all); + } } TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) @@ -390,7 +417,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{nullptr, nullptr, nullptr, @@ -411,7 +438,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{"a+++___+++ccc", nullptr, @@ -433,7 +460,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{nullptr, nullptr, nullptr, nullptr}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -444,7 +471,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {0, 4})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {0, 4})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{ "a+++___+++ccc", nullptr, "___|||efgh|||ijk", "zzz|||xxxxx"}; @@ -457,7 +484,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{ nullptr, nullptr, "0a0b0c###5x5y5z", nullptr, "ééé-+-12345abcdef", "aaaééébbbéééccc=+=12345"}; auto const expected = @@ -469,7 +496,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {5, 11})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {5, 11})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{"abcdef^^^012345^^^___^^^xxx000", "___~!~11111~!~00000", @@ -486,7 +513,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements(string_lv, sep_col); + auto const results = cudf::strings::join_list_elements(string_lv, sep_col); std::vector h_expected{nullptr, nullptr, nullptr, nullptr, "0a0b0c###5x5y5z"}; auto const expected = STR_COL{h_expected.begin(), h_expected.end(), nulls_from_nullptr(h_expected)}; @@ -497,7 +524,7 @@ TEST_F(StringsListsConcatenateTest, SlicedListsWithColumnSeparators) { auto const string_lv = cudf::lists_column_view(cudf::slice(string_lists->view(), {3, 8})[0]); auto const sep_col = cudf::strings_column_view(cudf::slice(separators->view(), {3, 8})[0]); - auto const results = cudf::strings::concatenate_list_elements( + auto const results = cudf::strings::join_list_elements( string_lv, sep_col, cudf::string_scalar("|||"), cudf::string_scalar("___")); std::vector h_expected{"zzz|||xxxxx", nullptr, diff --git a/python/cudf/cudf/_lib/cpp/strings/combine.pxd b/python/cudf/cudf/_lib/cpp/strings/combine.pxd index 250c6441882..51c706b68d0 100644 --- a/python/cudf/cudf/_lib/cpp/strings/combine.pxd +++ b/python/cudf/cudf/_lib/cpp/strings/combine.pxd @@ -18,13 +18,13 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: string_scalar separator, string_scalar narep) except + - cdef unique_ptr[column] concatenate_list_elements( + cdef unique_ptr[column] join_list_elements( column_view lists_strings_column, column_view separators, string_scalar separator_narep, string_scalar string_narep) except + - cdef unique_ptr[column] concatenate_list_elements( + cdef unique_ptr[column] join_list_elements( column_view lists_strings_column, string_scalar separator, string_scalar narep) except + diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx index 25619de3ed0..0d7dfb5c619 100644 --- a/python/cudf/cudf/_lib/strings/combine.pyx +++ b/python/cudf/cudf/_lib/strings/combine.pyx @@ -16,7 +16,7 @@ from cudf._lib.table cimport Table from cudf._lib.cpp.strings.combine cimport ( concatenate as cpp_concatenate, join_strings as cpp_join_strings, - concatenate_list_elements as cpp_concatenate_list_elements + join_list_elements as cpp_join_list_elements ) @@ -105,7 +105,7 @@ def join_lists_with_scalar( ) with nogil: - c_result = move(cpp_concatenate_list_elements( + c_result = move(cpp_join_list_elements( source_view, scalar_separator[0], scalar_narep[0] @@ -142,7 +142,7 @@ def join_lists_with_column( ) with nogil: - c_result = move(cpp_concatenate_list_elements( + c_result = move(cpp_join_list_elements( source_view, separator_view, scalar_separator_narep[0],