From fab7c4f17a8af46f5fa96e091e115313eb33bf87 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Mon, 16 Mar 2020 13:31:32 -0400 Subject: [PATCH 1/9] revert null/empty logic --- cpp/include/cudf/scalar/scalar.hpp | 4 +--- cpp/include/cudf/scalar/scalar_device_view.cuh | 10 ++++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index dbd4e0a52cc..d0c1d8cd61e 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -304,9 +304,7 @@ class string_scalar : public scalar { * * @param stream The CUDA stream to do the operation in */ - value_type value(cudaStream_t stream = 0) const { - return (is_valid(stream) && size()==0) ? value_type{"",0} : value_type{data(), size()}; - } + value_type value(cudaStream_t stream = 0) const { return value_type{data(), size()}; } /** * @brief Returns the size of the string in bytes diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh index e8c2245b38a..bdf6be180e8 100644 --- a/cpp/include/cudf/scalar/scalar_device_view.cuh +++ b/cpp/include/cudf/scalar/scalar_device_view.cuh @@ -158,10 +158,7 @@ class string_scalar_device_view /** * @brief Returns string_view of the value of this scalar. */ - __device__ string_view value() const noexcept { - return (is_valid() && _size==0) ? string_view{"",0} : - string_view(this->data(), _size); - } + __device__ ValueType value() const noexcept { return ValueType{this->data(), _size}; } /** * @brief Returns a raw pointer to the value in device memory @@ -170,6 +167,11 @@ class string_scalar_device_view return static_cast(_data); } + /** + * @brief Returns the size of the string in bytes. + */ + __device__ size_type size() const noexcept { return _size; } + private: const char* _data{}; ///< Pointer to device memory containing the value size_type _size; ///< Size of the string in bytes From b09f99cf79920ceed8c4d78511da0a6878a64a22 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Mon, 16 Mar 2020 13:32:00 -0400 Subject: [PATCH 2/9] remove is_null --- cpp/include/cudf/strings/string_view.cuh | 5 ----- cpp/include/cudf/strings/string_view.inl | 9 +++------ 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index 9b9e6f481bf..7501b79333a 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -79,11 +79,6 @@ class string_view * @brief Return true if string has no characters */ __host__ __device__ bool empty() const; - /** - * @brief Return true if string is NULL. - * That is, `data()==nullptr` for this instance. - */ - __host__ __device__ bool is_null() const; /** * @brief Handy iterator for navigating through encoded characters. diff --git a/cpp/include/cudf/strings/string_view.inl b/cpp/include/cudf/strings/string_view.inl index acf2ece8845..d6e9dd9e0fe 100644 --- a/cpp/include/cudf/strings/string_view.inl +++ b/cpp/include/cudf/strings/string_view.inl @@ -97,11 +97,6 @@ __host__ __device__ inline bool string_view::empty() const return _bytes == 0; } -__host__ __device__ inline bool string_view::is_null() const -{ - return _data == nullptr; -} - // this custom iterator knows about UTF8 encoding __device__ inline string_view::const_iterator::const_iterator(const string_view& str, size_type pos) : p{str.data()}, bytes{str.size_bytes()}, @@ -261,13 +256,15 @@ __device__ inline int string_view::compare(const string_view& in) const __device__ inline int string_view::compare(const char* data, size_type bytes) const { + size_type const len1 = size_bytes(); + if( len1==0 && bytes==0 ) + return 0; const unsigned char* ptr1 = reinterpret_cast(this->data()); if(!ptr1) return -1; const unsigned char* ptr2 = reinterpret_cast(data); if(!ptr2) return 1; - size_type len1 = size_bytes(); size_type idx = 0; for(; (idx < len1) && (idx < bytes); ++idx) { From dd960529328359717c74cfd55b882bd14a2c61e1 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Mon, 16 Mar 2020 13:32:22 -0400 Subject: [PATCH 3/9] remove 1-byte chars col logic --- cpp/src/strings/utilities.cu | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index da83c0787c0..cb12293b080 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -99,8 +99,8 @@ std::unique_ptr child_chars_from_string_vector( auto d_strings = strings.data().get(); auto execpol = rmm::exec_policy(stream); size_type bytes = thrust::device_pointer_cast(d_offsets)[count]; - if( (bytes==0) && (null_count < count) ) - bytes = 1; // all entries are empty strings +// if( (bytes==0) && (null_count < count) ) +// bytes = 1; // all entries are empty strings // create column auto chars_column = make_numeric_column( data_type{INT8}, bytes, @@ -111,9 +111,8 @@ std::unique_ptr child_chars_from_string_vector( thrust::for_each_n(execpol->on(stream), thrust::make_counting_iterator(0), count, [d_strings, d_offsets, d_chars] __device__(size_type idx){ - string_view d_str = d_strings[idx]; - if( !d_str.is_null() ) - memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes() ); + string_view const d_str = d_strings[idx]; + memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes() ); }); return chars_column; @@ -129,8 +128,8 @@ std::unique_ptr create_chars_child_column( cudf::size_type strings_count // If we have all nulls, a null chars column is allowed. // If all non-null strings are empty strings, we need a non-null chars column. // In this case we set the bytes to 1 to create a minimal one-byte chars column. - if( (total_bytes==0) && (null_count < strings_count) ) - total_bytes = 1; // all entries are empty strings (not nulls) +// if( (total_bytes==0) && (null_count < strings_count) ) +// total_bytes = 1; // all entries are empty strings (not nulls) // return chars column return make_numeric_column( data_type{INT8}, total_bytes, mask_state::UNALLOCATED, stream, mr ); } From 0993e097491a7e229fe3a762bce4144780d3c3eb Mon Sep 17 00:00:00 2001 From: davidwendt Date: Mon, 16 Mar 2020 13:33:16 -0400 Subject: [PATCH 4/9] use scalar:is_valid instead of string_view:is_null --- cpp/src/strings/combine.cu | 33 +++++++++++++-------------------- cpp/src/strings/filling/fill.cu | 23 ++++++++--------------- 2 files changed, 21 insertions(+), 35 deletions(-) diff --git a/cpp/src/strings/combine.cu b/cpp/src/strings/combine.cu index 9863d5cb583..fd6766657c2 100644 --- a/cpp/src/strings/combine.cu +++ b/cpp/src/strings/combine.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -58,11 +59,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, CUDF_EXPECTS( separator.is_valid(), "Parameter separator must be a valid string_scalar"); string_view d_separator(separator.data(),separator.size()); - string_view const d_narep = [&narep] { - if( !narep.is_valid() ) - return string_view(nullptr,0); - return narep.size()==0 ? string_view("",0) : string_view(narep.data(),narep.size()); - } (); + auto d_narep = get_scalar_device_view(const_cast(narep)); // Create device views from the strings columns. auto table = table_device_view::create(strings_columns,stream); @@ -75,7 +72,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, [d_table, d_narep] __device__ (size_type idx) { bool null_element = thrust::any_of( thrust::seq, d_table.begin(), d_table.end(), [idx] (auto col) { return col.is_null(idx);}); - return( !null_element || !d_narep.is_null() ); + return( !null_element || d_narep.is_valid() ); }, stream, mr ); rmm::device_buffer null_mask = valid_mask.first; auto null_count = valid_mask.second; @@ -85,12 +82,12 @@ std::unique_ptr concatenate( table_view const& strings_columns, // for this row (idx), iterate over each column and add up the bytes bool null_element = thrust::any_of( thrust::seq, d_table.begin(), d_table.end(), [row_idx] (auto const& d_column) { return d_column.is_null(row_idx);}); - if( null_element && d_narep.is_null() ) + if( null_element && !d_narep.is_valid() ) return 0; size_type bytes = thrust::transform_reduce( thrust::seq, d_table.begin(), d_table.end(), [row_idx, d_separator, d_narep] __device__ (column_device_view const& d_column) { return d_separator.size_bytes() + - (d_column.is_null(row_idx) ? d_narep.size_bytes() + (d_column.is_null(row_idx) ? d_narep.size() : d_column.element(row_idx).size_bytes()); }, 0, thrust::plus()); // separator goes only in between elements @@ -113,7 +110,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, [d_table, num_columns, d_separator, d_narep, d_results_offsets, d_results_chars] __device__(size_type idx){ bool null_element = thrust::any_of( thrust::seq, d_table.begin(), d_table.end(), [idx] (column_device_view const& col) { return col.is_null(idx);}); - if( null_element && d_narep.is_null() ) + if( null_element && !d_narep.is_valid() ) return; // do not write to buffer at all if any column element for this row is null size_type offset = d_results_offsets[idx]; char* d_buffer = d_results_chars + offset; @@ -121,7 +118,7 @@ std::unique_ptr concatenate( table_view const& strings_columns, for( size_type col_idx=0; col_idx < num_columns; ++col_idx ) { auto d_column = d_table.column(col_idx); - string_view d_str = d_column.is_null(idx) ? d_narep : d_column.element(idx); + string_view d_str = d_column.is_null(idx) ? d_narep.value() : d_column.element(idx); d_buffer = detail::copy_string(d_buffer, d_str); // separator goes only in between elements if( col_idx+1 < num_columns ) @@ -148,11 +145,7 @@ std::unique_ptr join_strings( strings_column_view const& strings, auto execpol = rmm::exec_policy(stream); string_view d_separator(separator.data(),separator.size()); - string_view const d_narep = [&narep] { - if( !narep.is_valid() ) - return string_view(nullptr,0); - return narep.size()==0 ? string_view("",0) : string_view(narep.data(),narep.size()); - } (); + auto d_narep = get_scalar_device_view(const_cast(narep)); auto strings_column = column_device_view::create(strings.parent(),stream); auto d_strings = *strings_column; @@ -169,9 +162,9 @@ std::unique_ptr join_strings( strings_column_view const& strings, size_type bytes = 0; if( d_strings.is_null(idx) ) { - if( d_narep.is_null() ) + if( !d_narep.is_valid() ) return 0; // skip nulls - bytes += d_narep.size_bytes(); + bytes += d_narep.size(); } else bytes += d_strings.element(idx).size_bytes(); @@ -197,7 +190,7 @@ std::unique_ptr join_strings( strings_column_view const& strings, // only one entry so it is either all valid or all null size_type null_count = 0; rmm::device_buffer null_mask; // init to null null-mask - if( strings.null_count()==strings_count && d_narep.is_null() ) + if( strings.null_count()==strings_count && !narep.is_valid() ) { null_mask = create_null_mask(1,cudf::mask_state::ALL_NULL,stream,mr); null_count = 1; @@ -211,9 +204,9 @@ std::unique_ptr join_strings( strings_column_view const& strings, char* d_buffer = d_chars + offset; if( d_strings.is_null(idx) ) { - if( d_narep.is_null() ) + if( !d_narep.is_valid() ) return; // do not write to buffer if element is null (including separator) - d_buffer = detail::copy_string(d_buffer, d_narep); + d_buffer = detail::copy_string(d_buffer, d_narep.value()); } else { diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu index 24efee9481e..1bd39982750 100644 --- a/cpp/src/strings/filling/fill.cu +++ b/cpp/src/strings/filling/fill.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -47,11 +48,7 @@ std::unique_ptr fill( strings_column_view const& strings, return std::make_unique( strings.parent() ); // string_scalar.data() is null for valid, empty strings - string_view const d_value = [&] { - if( !value.is_valid() ) - return string_view(nullptr,0); - return value.size()==0 ? string_view("",0) : string_view(value.data(),value.size()); - } (); + auto d_value = get_scalar_device_view(const_cast(value)); auto strings_column = column_device_view::create(strings.parent(),stream); auto d_strings = *strings_column; @@ -61,19 +58,16 @@ std::unique_ptr fill( strings_column_view const& strings, thrust::make_counting_iterator(0), thrust::make_counting_iterator(strings_count), [d_strings, begin, end, d_value] __device__ (size_type idx) { - return ((begin <= idx) && (idx < end)) ? !d_value.is_null() : !d_strings.is_null(idx); + return ((begin <= idx) && (idx < end)) ? d_value.is_valid() : !d_strings.is_null(idx); }, stream, mr ); auto null_count = valid_mask.second; rmm::device_buffer& null_mask = valid_mask.first; // build offsets column auto offsets_transformer = [d_strings, begin, end, d_value] __device__ (size_type idx) { - if( ((begin <= idx) && (idx < end)) ? d_value.is_null() : d_strings.is_null(idx) ) + if( ((begin <= idx) && (idx < end)) ? !d_value.is_valid() : d_strings.is_null(idx) ) return 0; - int32_t bytes = d_value.size_bytes(); - if( (idx < begin) || (idx >= end) ) - bytes = d_strings.element(idx).size_bytes(); - return bytes; + return ((begin <= idx) && (idx < end)) ? d_value.size() : d_strings.element(idx).size_bytes(); }; auto offsets_transformer_itr = thrust::make_transform_iterator( thrust::make_counting_iterator(0), offsets_transformer ); auto offsets_column = detail::make_offsets_child_column(offsets_transformer_itr, @@ -88,11 +82,10 @@ std::unique_ptr fill( strings_column_view const& strings, auto d_chars = chars_column->mutable_view().data(); thrust::for_each_n(rmm::exec_policy(stream)->on(stream), thrust::make_counting_iterator(0), strings_count, [d_strings, begin, end, d_value, d_offsets, d_chars] __device__(size_type idx){ - if( ((begin <= idx) && (idx < end)) ? d_value.is_null() : d_strings.is_null(idx) ) + if( ((begin <= idx) && (idx < end)) ? !d_value.is_valid() : d_strings.is_null(idx) ) return; - string_view d_str = d_value; - if( (idx < begin) || (idx >= end) ) - d_str = d_strings.element(idx); + string_view const d_str = ((begin <= idx) && (idx < end)) ? d_value.value() + : d_strings.element(idx); memcpy( d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes() ); }); From 2ebee8d35d4371211fa95631a4c8f9ce90e32a27 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Tue, 17 Mar 2020 14:44:42 -0400 Subject: [PATCH 5/9] comment out workaround logic --- python/cudf/cudf/core/column/string.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 4c7bea5f515..bb65db699ba 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1910,13 +1910,13 @@ def __init__(self, mask=None, size=None, offset=0, children=()): # For an "all empty" StringColumn (e.g., [""]) libcudf still # needs the chars child column pointer to be non-null: - if self.size: - if self.children[1].size == 0 and self.null_count != self.size: - offsets = self.base_children[0] - chars = column_empty( - self.base_children[1].size + 1, dtype="int8" - ) - self.set_base_children((offsets, chars)) + #if self.size: + # if self.children[1].size == 0 and self.null_count != self.size: + # offsets = self.base_children[0] + # chars = column_empty( + # self.base_children[1].size + 1, dtype="int8" + # ) + # self.set_base_children((offsets, chars)) # TODO: Remove these once NVStrings is fully deprecated / removed self._nvstrings = None From a3a8fbaf502402812342ee275125fe416ee9bc1b Mon Sep 17 00:00:00 2001 From: davidwendt Date: Fri, 20 Mar 2020 12:16:52 -0400 Subject: [PATCH 6/9] update changelog --- CHANGELOG.md | 1 + python/cudf/cudf/core/column/string.py | 14 +++++++------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e185033f65..d88ad64d4d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ - PR #4073 Enable contiguous split java test - PR #4527 Add JNI and java bindings for matches_re - PR #4599 Add Java and JNI bindings for string replace +- PR #4548 Remove string_view is_null method ## Bug Fixes diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 33c4bb19d65..603abf82993 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1913,13 +1913,13 @@ def __init__(self, mask=None, size=None, offset=0, children=()): # For an "all empty" StringColumn (e.g., [""]) libcudf still # needs the chars child column pointer to be non-null: - #if self.size: - # if self.children[1].size == 0 and self.null_count != self.size: - # offsets = self.base_children[0] - # chars = column_empty( - # self.base_children[1].size + 1, dtype="int8" - # ) - # self.set_base_children((offsets, chars)) + # if self.size: + # if self.children[1].size == 0 and self.null_count != self.size: + # offsets = self.base_children[0] + # chars = column_empty( + # self.base_children[1].size + 1, dtype="int8" + # ) + # self.set_base_children((offsets, chars)) # TODO: Remove these once NVStrings is fully deprecated / removed self._nvstrings = None From f84c8984f08219fe7b089983799577ba3a220a10 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Fri, 20 Mar 2020 15:07:05 -0400 Subject: [PATCH 7/9] remove obsoleted ifs in string_view::compare --- cpp/include/cudf/strings/string_view.inl | 6 ------ 1 file changed, 6 deletions(-) diff --git a/cpp/include/cudf/strings/string_view.inl b/cpp/include/cudf/strings/string_view.inl index d6e9dd9e0fe..6fcafc07f97 100644 --- a/cpp/include/cudf/strings/string_view.inl +++ b/cpp/include/cudf/strings/string_view.inl @@ -257,14 +257,8 @@ __device__ inline int string_view::compare(const string_view& in) const __device__ inline int string_view::compare(const char* data, size_type bytes) const { size_type const len1 = size_bytes(); - if( len1==0 && bytes==0 ) - return 0; const unsigned char* ptr1 = reinterpret_cast(this->data()); - if(!ptr1) - return -1; const unsigned char* ptr2 = reinterpret_cast(data); - if(!ptr2) - return 1; size_type idx = 0; for(; (idx < len1) && (idx < bytes); ++idx) { From 7a0c1319f7e4f226cee188ed422a8f98aefba2ce Mon Sep 17 00:00:00 2001 From: davidwendt Date: Fri, 20 Mar 2020 18:18:33 -0400 Subject: [PATCH 8/9] remove workaround code for 1-byte empty-strings column --- python/cudf/cudf/core/column/string.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 603abf82993..a3d11d1ce20 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1911,16 +1911,6 @@ def __init__(self, mask=None, size=None, offset=0, children=()): None, size, dtype, mask=mask, offset=offset, children=children ) - # For an "all empty" StringColumn (e.g., [""]) libcudf still - # needs the chars child column pointer to be non-null: - # if self.size: - # if self.children[1].size == 0 and self.null_count != self.size: - # offsets = self.base_children[0] - # chars = column_empty( - # self.base_children[1].size + 1, dtype="int8" - # ) - # self.set_base_children((offsets, chars)) - # TODO: Remove these once NVStrings is fully deprecated / removed self._nvstrings = None self._nvcategory = None From 089102cc00acd9bb481eeb4fd71f7e647805b681 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Sun, 22 Mar 2020 17:56:54 -0400 Subject: [PATCH 9/9] remove commented out code --- cpp/src/strings/utilities.cu | 8 -------- 1 file changed, 8 deletions(-) diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index cb12293b080..ef1dd575c48 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -99,8 +99,6 @@ std::unique_ptr child_chars_from_string_vector( auto d_strings = strings.data().get(); auto execpol = rmm::exec_policy(stream); size_type bytes = thrust::device_pointer_cast(d_offsets)[count]; -// if( (bytes==0) && (null_count < count) ) -// bytes = 1; // all entries are empty strings // create column auto chars_column = make_numeric_column( data_type{INT8}, bytes, @@ -125,12 +123,6 @@ std::unique_ptr create_chars_child_column( cudf::size_type strings_count cudaStream_t stream) { CUDF_EXPECTS(null_count <= strings_count, "Invalid null count"); - // If we have all nulls, a null chars column is allowed. - // If all non-null strings are empty strings, we need a non-null chars column. - // In this case we set the bytes to 1 to create a minimal one-byte chars column. -// if( (total_bytes==0) && (null_count < strings_count) ) -// total_bytes = 1; // all entries are empty strings (not nulls) - // return chars column return make_numeric_column( data_type{INT8}, total_bytes, mask_state::UNALLOCATED, stream, mr ); }