Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add separator-on-null parameter to strings concatenate APIs #8282

Merged
merged 10 commits into from
May 24, 2021
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -333,8 +333,8 @@ add_library(cudf
src/strings/char_types/char_cases.cu
src/strings/char_types/char_types.cu
src/strings/combine/concatenate.cu
src/strings/combine/concatenate_list_elements.cu
src/strings/combine/join.cu
src/strings/combine/join_list_elements.cu
src/strings/contains.cu
src/strings/convert/convert_booleans.cu
src/strings/convert/convert_datetime.cu
Expand Down
134 changes: 87 additions & 47 deletions cpp/include/cudf/strings/combine.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -30,12 +30,21 @@ namespace strings {
* @brief Strings APIs for concatenate and join
*/

/**
 * @brief Controls whether separators are emitted alongside null string elements
 * when strings are combined.
 */
enum class separator_on_nulls {
  /// Always add separators between elements
  YES,
  /// Do not add separators if an element is null
  NO
};

/**
* @brief Concatenates all strings in the column into one new string delimited
* by an optional separator string.
*
* This returns a column with one string. Any null entries are ignored unless
* the narep parameter specifies a replacement string.
* the @p narep parameter specifies a replacement string.
*
* @code{.pseudo}
* Example:
Expand Down Expand Up @@ -70,11 +79,9 @@ std::unique_ptr<column> join_strings(
*
* - If row separator for a given row is null, output column for that row is null, unless
* there is a valid @p separator_narep
* - If all column values for a given row is null, output column for that row is null, unless
* there is a valid @p col_narep
* - null column values for a given row are skipped, if the column replacement isn't valid
* - The separator is only applied between two valid column values
* - If valid @p separator_narep and @p col_narep are provided, the output column is always
* - The separator is applied between two output row values if the @p separate_nulls
* is `YES` or only between valid rows if @p separate_nulls is `NO`.
* - If @p separator_narep and @p col_narep are both valid, the output column is always
* non nullable
*
* @code{.pseudo}
Expand All @@ -83,16 +90,25 @@ std::unique_ptr<column> join_strings(
* c1 = [null, 'cc', 'dd', null, null, 'gg']
* c2 = ['bb', '', null, null, null, 'hh']
* sep = ['::', '%%', '^^', '!', '*', null]
* out0 = concatenate([c0, c1, c2], sep)
* out0 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, null]
* out = concatenate({c0, c1, c2}, sep)
* // all rows have at least one null or sep[i]==null
* out is [null, null, null, null, null, null]
*
* sep_rep = '+'
* out1 = concatenate([c0, c1, c2], sep, sep_rep)
* out1 is ['aa::bb', 'cc%%', '^^dd', 'ee', null, 'ff+gg+hh']
*
* col_rep = '-'
* out2 = concatenate([c0, c1, c2], sep, invalid_sep_rep, col_rep)
* out2 is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null]
* out = concatenate({c0, c1, c2}, sep, sep_rep)
* // all rows with at least one null output as null
* out is [null, null, null, null, null, 'ff+gg+hh']
*
* col_narep = '-'
* sep_na = non-valid scalar
* out = concatenate({c0, c1, c2}, sep, sep_na, col_narep)
* // only the null entry in the sep column produces a null row
* out is ['aa::-::bb', '-%%cc%%', '^^dd^^-', 'ee!-!-', '-*-*-', null]
*
* col_narep = ''
* out = concatenate({c0, c1, c2}, sep, sep_rep, col_narep, separator_on_nulls::NO)
* // parameter suppresses separator for null rows
* out is ['aa::bb', 'cc%%', '^^dd', 'ee', '', 'ff+gg+hh']
* @endcode
*
* @throw cudf::logic_error if no input columns are specified - table view is empty
Expand All @@ -108,6 +124,8 @@ std::unique_ptr<column> join_strings(
* @param col_narep String that should be used in place of any null strings
* found in any column. Default of invalid-scalar means no null column value replacements.
* Default is an invalid string.
* @param separate_nulls If YES, then the separator is included for null rows
* if `col_narep` is valid.
* @param mr Resource for allocating device memory.
* @return New column with concatenated results.
*/
Expand All @@ -116,15 +134,9 @@ std::unique_ptr<column> concatenate(
strings_column_view const& separators,
string_scalar const& separator_narep = string_scalar("", false),
string_scalar const& col_narep = string_scalar("", false),
separator_on_nulls separate_nulls = separator_on_nulls::YES,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @addtogroup strings_combine
* @{
* @file strings/combine.hpp
* @brief Strings APIs for concatenate and join
*/

/**
* @brief Row-wise concatenates the given list of strings columns and
* returns a single strings column result.
Expand All @@ -136,59 +148,77 @@ std::unique_ptr<column> concatenate(
* row to be null entry unless a narep string is specified to be used
* in its place.
*
* The number of strings in the columns provided must be the same.
* If @p separate_nulls is set to `NO` and @p narep is valid then
* separators are not added to the output between null elements.
* Otherwise, separators are always added if @p narep is valid.
*
* More than one column must be specified in the input @p strings_columns
* table.
*
* @code{.pseudo}
* Example:
* s1 = ['aa', null, '', 'aa']
* s2 = ['', 'bb', 'bb', null]
* r1 = concatenate([s1,s2])
* r1 is ['aa', null, 'bb', null]
* r2 = concatenate([s1,s2],':','_')
* r2 is ['aa:', '_:bb', ':bb', 'aa:_']
* s1 = ['aa', null, '', 'dd']
* s2 = ['', 'bb', 'cc', null]
* out = concatenate({s1, s2})
* out is ['aa', null, 'cc', null]
*
* out = concatenate({s1, s2}, ':', '_')
* out is ['aa:', '_:bb', ':cc', 'dd:_']
*
* out = concatenate({s1, s2}, ':', '', separator_on_nulls::NO)
* out is ['aa:', 'bb', ':cc', 'dd']
* @endcode
*
* @throw cudf::logic_error if input columns are not all strings columns.
* @throw cudf::logic_error if separator is not valid.
* @throw cudf::logic_error if only one column is specified
*
* @param strings_columns List of string columns to concatenate.
* @param separator String that should be inserted between each string from each row.
* Default is an empty string.
* @param narep String that should be used in place of any null strings
* found in any column. Default of invalid-scalar means any null entry in any column will
* produce a null result for that row.
* @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column with concatenated results.
*/
std::unique_ptr<column> concatenate(
table_view const& strings_columns,
string_scalar const& separator = string_scalar(""),
string_scalar const& narep = string_scalar("", false),
separator_on_nulls separate_nulls = separator_on_nulls::YES,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Given a lists column of strings (each row is a list of strings), concatenates the strings
* within each row and returns a single strings column result.
*
* Each new string is created by concatenating the strings from the same row (same list element)
* delimited by the row separator provided in the `separators` strings column.
* delimited by the row separator provided in the @p separators strings column.
*
* A null list row will always result in a null string in the output row. Any non-null list row
* having a null element will result in the corresponding output row to be null unless a valid
* `string_narep` scalar is provided to be used in its place. Any null row in the `separators`
* column will also result in a null output row unless a valid `separator_narep` scalar is provided
* @p string_narep scalar is provided to be used in its place. Any null row in the @p separators
* column will also result in a null output row unless a valid @p separator_narep scalar is provided
* to be used in place of the null separators.
*
* If @p separate_nulls is set to `NO` and @p string_narep is valid then separators are not added
* to the output between null elements. Otherwise, separators are always added if
* @p string_narep is valid.
*
* @code{.pseudo}
* Example:
* s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff', 'gg'} ]
* s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff', 'gg'] ]
* sep = ['::', '%%', '!', '*', null]
*
* r1 = strings::concatenate_list_elements(s, sep)
* r1 is ['aa::bb::cc', null, '!dd', null, null]
* out = join_list_elements(s, sep)
* out is ['aa::bb::cc', null, '!dd', null, null]
*
* out = join_list_elements(s, sep, ':', '_')
* out is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg']
*
* r2 = strings::concatenate_list_elements(s, sep, ':', '_')
* r2 is ['aa::bb::cc', null, '!dd', 'ee*_', 'ff:gg']
* out = join_list_elements(s, sep, ':', '', separator_on_nulls::NO)
* out is ['aa::bb::cc', null, '!dd', 'ee', 'ff:gg']
* @endcode
*
* @throw cudf::logic_error if input column is not lists of strings column.
Expand All @@ -203,36 +233,44 @@ std::unique_ptr<column> concatenate(
* @param string_narep String that should be used to replace null strings in any non-null list row,
* default is an invalid-scalar denoting that list rows containing null strings will result
* in null string in the corresponding output rows.
* @param separate_nulls If YES, then the separator is included for null rows if `string_narep` is valid.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column with concatenated results.
*/
std::unique_ptr<column> concatenate_list_elements(
std::unique_ptr<column> join_list_elements(
const lists_column_view& lists_strings_column,
const strings_column_view& separators,
string_scalar const& separator_narep = string_scalar("", false),
string_scalar const& string_narep = string_scalar("", false),
separator_on_nulls separate_nulls = separator_on_nulls::YES,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Given a lists column of strings (each row is a list of strings), concatenates the strings
* within each row and returns a single strings column result.
*
* Each new string is created by concatenating the strings from the same row (same list element)
* delimited by the separator provided.
* delimited by the @p separator provided.
*
* A null list row will always result in a null string in the output row. Any non-null list row
* having a null element will result in the corresponding output row to be null unless a narep
* string is specified to be used in its place.
* having a null element will result in the corresponding output row to be null unless a
* @p narep string is specified to be used in its place.
*
* If @p separate_nulls is set to `NO` and @p narep is valid then separators are not added to the
* output between null elements. Otherwise, separators are always added if @p narep is valid.
*
* @code{.pseudo}
* Example:
* s = [ {'aa', 'bb', 'cc'}, null, {'', 'dd'}, {'ee', null}, {'ff'} ]
* s = [ ['aa', 'bb', 'cc'], null, ['', 'dd'], ['ee', null], ['ff'] ]
*
* out = join_list_elements(s)
* out is ['aabbcc', null, 'dd', null, 'ff']
*
* r1 = strings::concatenate_list_elements(s)
* r1 is ['aabbcc', null, 'dd', null, 'ff']
* out = join_list_elements(s, ':', '_')
* out is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff']
*
* r2 = strings::concatenate_list_elements(s, ':', '_')
* r2 is ['aa:bb:cc', null, ':dd', 'ee:_', 'ff']
* out = join_list_elements(s, ':', '', separator_on_nulls::NO)
* out is ['aa:bb:cc', null, ':dd', 'ee', 'ff']
* @endcode
*
* @throw cudf::logic_error if input column is not lists of strings column.
Expand All @@ -244,13 +282,15 @@ std::unique_ptr<column> concatenate_list_elements(
* @param narep String that should be used to replace null strings in any non-null list row, default
* is an invalid-scalar denoting that list rows containing null strings will result in null
* string in the corresponding output rows.
* @param separate_nulls If YES, then the separator is included for null rows if `narep` is valid.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New strings column with concatenated results.
*/
std::unique_ptr<column> concatenate_list_elements(
std::unique_ptr<column> join_list_elements(
const lists_column_view& lists_strings_column,
string_scalar const& separator = string_scalar(""),
string_scalar const& narep = string_scalar("", false),
separator_on_nulls separate_nulls = separator_on_nulls::YES,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
Expand Down
4 changes: 3 additions & 1 deletion cpp/include/cudf/strings/detail/combine.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -17,6 +17,7 @@

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/combine.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table_view.hpp>

Expand All @@ -36,6 +37,7 @@ std::unique_ptr<column> concatenate(
table_view const& strings_columns,
string_scalar const& separator,
string_scalar const& narep,
separator_on_nulls separate_nulls = separator_on_nulls::YES,
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
rmm::cuda_stream_view stream = rmm::cuda_stream_default,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

Expand Down
15 changes: 12 additions & 3 deletions cpp/src/io/csv/writer_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/detail/combine.hpp>
#include <cudf/strings/detail/converters.hpp>
#include <cudf/strings/detail/replace.hpp>
#include <cudf/strings/detail/utilities.cuh>

#include <rmm/cuda_stream_view.hpp>
Expand Down Expand Up @@ -404,11 +405,19 @@ void writer::impl::write(table_view const& table,
auto str_table_view = str_table_ptr->view();

// concatenate columns in each row into one big string column
//(using null representation and delimiter):
// (using null representation and delimiter):
//
std::string delimiter_str{options_.get_inter_column_delimiter()};
auto str_concat_col = cudf::strings::detail::concatenate(
str_table_view, delimiter_str, options_.get_na_rep(), stream);
auto str_concat_col = [&] {
if (str_table_view.num_columns() > 1)
return cudf::strings::detail::concatenate(str_table_view,
delimiter_str,
options_.get_na_rep(),
strings::separator_on_nulls::YES,
stream);
cudf::string_scalar narep{options_.get_na_rep()};
return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream);
}();

write_chunked(str_concat_col->view(), metadata, stream);
}
Expand Down
Loading