Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Remove string_view is_null method #4548

Merged
merged 17 commits into from
Mar 23, 2020
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
- PR #4073 Enable contiguous split java test
- PR #4527 Add JNI and java bindings for matches_re
- PR #4599 Add Java and JNI bindings for string replace
- PR #4548 Remove string_view is_null method

## Bug Fixes

Expand Down
4 changes: 1 addition & 3 deletions cpp/include/cudf/scalar/scalar.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,9 +304,7 @@ class string_scalar : public scalar {
*
* @param stream The CUDA stream to do the operation in
*/
value_type value(cudaStream_t stream = 0) const {
return (is_valid(stream) && size()==0) ? value_type{"",0} : value_type{data(), size()};
}
value_type value(cudaStream_t stream = 0) const { return value_type{data(), size()}; }

/**
* @brief Returns the size of the string in bytes
Expand Down
10 changes: 6 additions & 4 deletions cpp/include/cudf/scalar/scalar_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,7 @@ class string_scalar_device_view
/**
* @brief Returns string_view of the value of this scalar.
*/
__device__ string_view value() const noexcept {
return (is_valid() && _size==0) ? string_view{"",0} :
string_view(this->data(), _size);
}
__device__ ValueType value() const noexcept { return ValueType{this->data(), _size}; }

/**
* @brief Returns a raw pointer to the value in device memory
Expand All @@ -170,6 +167,11 @@ class string_scalar_device_view
return static_cast<char const*>(_data);
}

/**
* @brief Returns the size of the string in bytes.
*/
__device__ size_type size() const noexcept { return _size; }

private:
const char* _data{}; ///< Pointer to device memory containing the value
size_type _size; ///< Size of the string in bytes
Expand Down
5 changes: 0 additions & 5 deletions cpp/include/cudf/strings/string_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,6 @@ class string_view
* @brief Return true if string has no characters
*/
__host__ __device__ bool empty() const;
/**
* @brief Return true if string is NULL.
* That is, `data()==nullptr` for this instance.
*/
__host__ __device__ bool is_null() const;

/**
* @brief Handy iterator for navigating through encoded characters.
Expand Down
9 changes: 3 additions & 6 deletions cpp/include/cudf/strings/string_view.inl
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,6 @@ __host__ __device__ inline bool string_view::empty() const
return _bytes == 0;
}

__host__ __device__ inline bool string_view::is_null() const
{
return _data == nullptr;
}

// this custom iterator knows about UTF8 encoding
__device__ inline string_view::const_iterator::const_iterator(const string_view& str, size_type pos)
: p{str.data()}, bytes{str.size_bytes()},
Expand Down Expand Up @@ -261,13 +256,15 @@ __device__ inline int string_view::compare(const string_view& in) const

__device__ inline int string_view::compare(const char* data, size_type bytes) const
{
size_type const len1 = size_bytes();
if( len1==0 && bytes==0 )
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
return 0;
const unsigned char* ptr1 = reinterpret_cast<const unsigned char*>(this->data());
if(!ptr1)
return -1;
const unsigned char* ptr2 = reinterpret_cast<const unsigned char*>(data);
if(!ptr2)
return 1;
size_type len1 = size_bytes();
size_type idx = 0;
for(; (idx < len1) && (idx < bytes); ++idx)
{
Expand Down
33 changes: 13 additions & 20 deletions cpp/src/strings/combine.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include <cudf/column/column_factories.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/combine.hpp>
Expand Down Expand Up @@ -58,11 +59,7 @@ std::unique_ptr<column> concatenate( table_view const& strings_columns,

CUDF_EXPECTS( separator.is_valid(), "Parameter separator must be a valid string_scalar");
string_view d_separator(separator.data(),separator.size());
string_view const d_narep = [&narep] {
if( !narep.is_valid() )
return string_view(nullptr,0);
return narep.size()==0 ? string_view("",0) : string_view(narep.data(),narep.size());
} ();
auto d_narep = get_scalar_device_view(const_cast<string_scalar&>(narep));

// Create device views from the strings columns.
auto table = table_device_view::create(strings_columns,stream);
Expand All @@ -75,7 +72,7 @@ std::unique_ptr<column> concatenate( table_view const& strings_columns,
[d_table, d_narep] __device__ (size_type idx) {
bool null_element = thrust::any_of( thrust::seq, d_table.begin(), d_table.end(),
[idx] (auto col) { return col.is_null(idx);});
return( !null_element || !d_narep.is_null() );
return( !null_element || d_narep.is_valid() );
}, stream, mr );
rmm::device_buffer null_mask = valid_mask.first;
auto null_count = valid_mask.second;
Expand All @@ -85,12 +82,12 @@ std::unique_ptr<column> concatenate( table_view const& strings_columns,
// for this row (idx), iterate over each column and add up the bytes
bool null_element = thrust::any_of( thrust::seq, d_table.begin(), d_table.end(),
[row_idx] (auto const& d_column) { return d_column.is_null(row_idx);});
if( null_element && d_narep.is_null() )
if( null_element && !d_narep.is_valid() )
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
return 0;
size_type bytes = thrust::transform_reduce( thrust::seq, d_table.begin(), d_table.end(),
[row_idx, d_separator, d_narep] __device__ (column_device_view const& d_column) {
return d_separator.size_bytes() +
(d_column.is_null(row_idx) ? d_narep.size_bytes()
(d_column.is_null(row_idx) ? d_narep.size()
: d_column.element<string_view>(row_idx).size_bytes());
}, 0, thrust::plus<size_type>());
// separator goes only in between elements
Expand All @@ -113,15 +110,15 @@ std::unique_ptr<column> concatenate( table_view const& strings_columns,
[d_table, num_columns, d_separator, d_narep, d_results_offsets, d_results_chars] __device__(size_type idx){
bool null_element = thrust::any_of( thrust::seq, d_table.begin(), d_table.end(),
[idx] (column_device_view const& col) { return col.is_null(idx);});
if( null_element && d_narep.is_null() )
if( null_element && !d_narep.is_valid() )
return; // do not write to buffer at all if any column element for this row is null
size_type offset = d_results_offsets[idx];
char* d_buffer = d_results_chars + offset;
// write out each column's entry for this row
for( size_type col_idx=0; col_idx < num_columns; ++col_idx )
{
auto d_column = d_table.column(col_idx);
string_view d_str = d_column.is_null(idx) ? d_narep : d_column.element<string_view>(idx);
string_view d_str = d_column.is_null(idx) ? d_narep.value() : d_column.element<string_view>(idx);
d_buffer = detail::copy_string(d_buffer, d_str);
// separator goes only in between elements
if( col_idx+1 < num_columns )
Expand All @@ -148,11 +145,7 @@ std::unique_ptr<column> join_strings( strings_column_view const& strings,

auto execpol = rmm::exec_policy(stream);
string_view d_separator(separator.data(),separator.size());
string_view const d_narep = [&narep] {
if( !narep.is_valid() )
return string_view(nullptr,0);
return narep.size()==0 ? string_view("",0) : string_view(narep.data(),narep.size());
} ();
auto d_narep = get_scalar_device_view(const_cast<string_scalar&>(narep));

auto strings_column = column_device_view::create(strings.parent(),stream);
auto d_strings = *strings_column;
Expand All @@ -169,9 +162,9 @@ std::unique_ptr<column> join_strings( strings_column_view const& strings,
size_type bytes = 0;
if( d_strings.is_null(idx) )
{
if( d_narep.is_null() )
if( !d_narep.is_valid() )
return 0; // skip nulls
bytes += d_narep.size_bytes();
bytes += d_narep.size();
}
else
bytes += d_strings.element<string_view>(idx).size_bytes();
Expand All @@ -197,7 +190,7 @@ std::unique_ptr<column> join_strings( strings_column_view const& strings,
// only one entry so it is either all valid or all null
size_type null_count = 0;
rmm::device_buffer null_mask; // init to null null-mask
if( strings.null_count()==strings_count && d_narep.is_null() )
if( strings.null_count()==strings_count && !narep.is_valid() )
{
null_mask = create_null_mask(1,cudf::mask_state::ALL_NULL,stream,mr);
null_count = 1;
Expand All @@ -211,9 +204,9 @@ std::unique_ptr<column> join_strings( strings_column_view const& strings,
char* d_buffer = d_chars + offset;
if( d_strings.is_null(idx) )
{
if( d_narep.is_null() )
if( !d_narep.is_valid() )
return; // do not write to buffer if element is null (including separator)
d_buffer = detail::copy_string(d_buffer, d_narep);
d_buffer = detail::copy_string(d_buffer, d_narep.value());
}
else
{
Expand Down
23 changes: 8 additions & 15 deletions cpp/src/strings/filling/fill.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include <cudf/null_mask.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/combine.hpp>
Expand Down Expand Up @@ -47,11 +48,7 @@ std::unique_ptr<column> fill( strings_column_view const& strings,
return std::make_unique<column>( strings.parent() );

// string_scalar.data() is null for valid, empty strings
string_view const d_value = [&] {
if( !value.is_valid() )
return string_view(nullptr,0);
return value.size()==0 ? string_view("",0) : string_view(value.data(),value.size());
} ();
auto d_value = get_scalar_device_view(const_cast<string_scalar&>(value));

auto strings_column = column_device_view::create(strings.parent(),stream);
auto d_strings = *strings_column;
Expand All @@ -61,19 +58,16 @@ std::unique_ptr<column> fill( strings_column_view const& strings,
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(strings_count),
[d_strings, begin, end, d_value] __device__ (size_type idx) {
return ((begin <= idx) && (idx < end)) ? !d_value.is_null() : !d_strings.is_null(idx);
return ((begin <= idx) && (idx < end)) ? d_value.is_valid() : !d_strings.is_null(idx);
}, stream, mr );
auto null_count = valid_mask.second;
rmm::device_buffer& null_mask = valid_mask.first;

// build offsets column
auto offsets_transformer = [d_strings, begin, end, d_value] __device__ (size_type idx) {
if( ((begin <= idx) && (idx < end)) ? d_value.is_null() : d_strings.is_null(idx) )
if( ((begin <= idx) && (idx < end)) ? !d_value.is_valid() : d_strings.is_null(idx) )
return 0;
int32_t bytes = d_value.size_bytes();
if( (idx < begin) || (idx >= end) )
bytes = d_strings.element<string_view>(idx).size_bytes();
return bytes;
return ((begin <= idx) && (idx < end)) ? d_value.size() : d_strings.element<string_view>(idx).size_bytes();
};
auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator<size_type>(0), offsets_transformer );
auto offsets_column = detail::make_offsets_child_column(offsets_transformer_itr,
Expand All @@ -88,11 +82,10 @@ std::unique_ptr<column> fill( strings_column_view const& strings,
auto d_chars = chars_column->mutable_view().data<char>();
thrust::for_each_n(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator<size_type>(0), strings_count,
[d_strings, begin, end, d_value, d_offsets, d_chars] __device__(size_type idx){
if( ((begin <= idx) && (idx < end)) ? d_value.is_null() : d_strings.is_null(idx) )
if( ((begin <= idx) && (idx < end)) ? !d_value.is_valid() : d_strings.is_null(idx) )
return;
string_view d_str = d_value;
if( (idx < begin) || (idx >= end) )
d_str = d_strings.element<string_view>(idx);
string_view const d_str = ((begin <= idx) && (idx < end)) ? d_value.value()
: d_strings.element<string_view>(idx);
memcpy( d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes() );
});

Expand Down
13 changes: 6 additions & 7 deletions cpp/src/strings/utilities.cu
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ std::unique_ptr<cudf::column> child_chars_from_string_vector(
auto d_strings = strings.data().get();
auto execpol = rmm::exec_policy(stream);
size_type bytes = thrust::device_pointer_cast(d_offsets)[count];
if( (bytes==0) && (null_count < count) )
bytes = 1; // all entries are empty strings
// if( (bytes==0) && (null_count < count) )
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
// bytes = 1; // all entries are empty strings

// create column
auto chars_column = make_numeric_column( data_type{INT8}, bytes,
Expand All @@ -111,9 +111,8 @@ std::unique_ptr<cudf::column> child_chars_from_string_vector(
thrust::for_each_n(execpol->on(stream),
thrust::make_counting_iterator<size_type>(0), count,
[d_strings, d_offsets, d_chars] __device__(size_type idx){
string_view d_str = d_strings[idx];
if( !d_str.is_null() )
memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes() );
string_view const d_str = d_strings[idx];
memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes() );
});

return chars_column;
Expand All @@ -129,8 +128,8 @@ std::unique_ptr<column> create_chars_child_column( cudf::size_type strings_count
// If we have all nulls, a null chars column is allowed.
// If all non-null strings are empty strings, we need a non-null chars column.
// In this case we set the bytes to 1 to create a minimal one-byte chars column.
if( (total_bytes==0) && (null_count < strings_count) )
total_bytes = 1; // all entries are empty strings (not nulls)
// if( (total_bytes==0) && (null_count < strings_count) )
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
// total_bytes = 1; // all entries are empty strings (not nulls)
// return chars column
return make_numeric_column( data_type{INT8}, total_bytes, mask_state::UNALLOCATED, stream, mr );
}
Expand Down
14 changes: 7 additions & 7 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1913,13 +1913,13 @@ def __init__(self, mask=None, size=None, offset=0, children=()):

# For an "all empty" StringColumn (e.g., [""]) libcudf still
# needs the chars child column pointer to be non-null:
if self.size:
if self.children[1].size == 0 and self.null_count != self.size:
offsets = self.base_children[0]
chars = column_empty(
self.base_children[1].size + 1, dtype="int8"
)
self.set_base_children((offsets, chars))
# if self.size:
# if self.children[1].size == 0 and self.null_count != self.size:
# offsets = self.base_children[0]
# chars = column_empty(
# self.base_children[1].size + 1, dtype="int8"
# )
# self.set_base_children((offsets, chars))

# TODO: Remove these once NVStrings is fully deprecated / removed
self._nvstrings = None
Expand Down