diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d63c7e75616..03cf4c7d2b7 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -753,7 +753,10 @@ if(CUDF_BUILD_BENCHMARKS) include(${rapids-cmake-dir}/cpm/gbench.cmake) rapids_cpm_gbench() - # Find or install NVBench + # Find or install NVBench. Temporarily force downloading of fmt because current versions of nvbench + # do not support the latest version of fmt, which is automatically pulled into our conda + # environments by mamba. + set(CPM_DOWNLOAD_fmt TRUE) include(${rapids-cmake-dir}/cpm/nvbench.cmake) rapids_cpm_nvbench() add_subdirectory(benchmarks) diff --git a/cpp/benchmarks/string/json.cu b/cpp/benchmarks/string/json.cu index 87528608cc7..d7c0066eb33 100644 --- a/cpp/benchmarks/string/json.cu +++ b/cpp/benchmarks/string/json.cu @@ -177,8 +177,8 @@ auto build_json_string_column(int desired_bytes, int num_rows) auto d_store_order = cudf::column_device_view::create(float_2bool_columns->get_column(2)); json_benchmark_row_builder jb{ desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order}; - auto children = - cudf::strings::detail::make_strings_children(jb, num_rows, cudf::get_default_stream()); + auto children = cudf::strings::detail::make_strings_children( + jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); return cudf::make_strings_column( num_rows, std::move(children.first), std::move(children.second), 0, {}); } diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index ade28faf645..3b8ed0f4e0d 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -34,14 +34,12 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr concatenate( - table_view const& strings_columns, - string_scalar const& separator, - string_scalar const& narep, - separator_on_nulls separate_nulls = separator_on_nulls::YES, - // Move before separate_nulls? - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate(table_view const& strings_columns, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc join_strings(table_view const&,string_scalar const&,string_scalar @@ -49,12 +47,11 @@ std::unique_ptr concatenate( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr join_strings( - strings_column_view const& strings, - string_scalar const& separator, - string_scalar const& narep, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr join_strings(strings_column_view const& strings, + string_scalar const& separator, + string_scalar const& narep, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc join_list_elements(table_view const&,string_scalar const&,string_scalar diff --git a/cpp/include/cudf/strings/detail/concatenate.hpp b/cpp/include/cudf/strings/detail/concatenate.hpp index caaeb2afbe7..511e240886a 100644 --- a/cpp/include/cudf/strings/detail/concatenate.hpp +++ b/cpp/include/cudf/strings/detail/concatenate.hpp @@ -42,10 +42,9 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. 
*/ -std::unique_ptr concatenate( - host_span columns, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate(host_span columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 79cec779e02..374c3b2cf68 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -56,13 +56,12 @@ namespace detail { * @return New strings column. */ template -std::unique_ptr copy_if_else( - StringIterLeft lhs_begin, - StringIterLeft lhs_end, - StringIterRight rhs_begin, - Filter filter_fn, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr copy_if_else(StringIterLeft lhs_begin, + StringIterLeft lhs_end, + StringIterRight rhs_begin, + Filter filter_fn, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = std::distance(lhs_begin, lhs_end); if (strings_count == 0) return make_empty_column(type_id::STRING); diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index e83f6dc0005..ee09ce9a7a9 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -99,14 +99,13 @@ namespace detail { * @return std::unique_ptr The result target column */ template -std::unique_ptr copy_range( - SourceValueIterator source_value_begin, - SourceValidityIterator source_validity_begin, - strings_column_view const& target, - size_type target_begin, - size_type target_end, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr copy_range(SourceValueIterator source_value_begin, + 
SourceValidityIterator source_validity_begin, + strings_column_view const& target, + size_type target_begin, + size_type target_end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS( (target_begin >= 0) && (target_begin < target.size()) && (target_end <= target.size()), diff --git a/cpp/include/cudf/strings/detail/copying.hpp b/cpp/include/cudf/strings/detail/copying.hpp index c70952b0962..7e82ad4c679 100644 --- a/cpp/include/cudf/strings/detail/copying.hpp +++ b/cpp/include/cudf/strings/detail/copying.hpp @@ -49,13 +49,11 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column of size (end-start)/step. */ -std::unique_ptr copy_slice( - strings_column_view const& strings, - size_type start, - size_type end = -1, - // Move before end? - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr copy_slice(strings_column_view const& strings, + size_type start, + size_type end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Returns a new strings column created by shifting the rows by a specified offset. diff --git a/cpp/include/cudf/strings/detail/fill.hpp b/cpp/include/cudf/strings/detail/fill.hpp index 1ad9663a614..43e3f6198f3 100644 --- a/cpp/include/cudf/strings/detail/fill.hpp +++ b/cpp/include/cudf/strings/detail/fill.hpp @@ -42,13 +42,12 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. 
*/ -std::unique_ptr fill( - strings_column_view const& strings, - size_type begin, - size_type end, - string_scalar const& value, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr fill(strings_column_view const& strings, + size_type begin, + size_type end, + string_scalar const& value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index dfc8f0dacc5..4820e6e77c7 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -288,12 +288,11 @@ std::unique_ptr gather_chars(StringIterator strings_begin, * @return New strings column containing the gathered strings. */ template -std::unique_ptr gather( - strings_column_view const& strings, - MapIterator begin, - MapIterator end, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr gather(strings_column_view const& strings, + MapIterator begin, + MapIterator end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const output_count = std::distance(begin, end); auto const strings_count = strings.size(); @@ -372,13 +371,12 @@ std::unique_ptr gather( * @return New strings column containing the gathered strings. 
*/ template -std::unique_ptr gather( - strings_column_view const& strings, - MapIterator begin, - MapIterator end, - bool nullify_out_of_bounds, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr gather(strings_column_view const& strings, + MapIterator begin, + MapIterator end, + bool nullify_out_of_bounds, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (nullify_out_of_bounds) return gather(strings, begin, end, stream, mr); return gather(strings, begin, end, stream, mr); diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/strings/detail/json.hpp index 8ea579ae5c0..0fb06d36570 100644 --- a/cpp/include/cudf/strings/detail/json.hpp +++ b/cpp/include/cudf/strings/detail/json.hpp @@ -16,6 +16,8 @@ #pragma once +#include +#include #include #include @@ -30,12 +32,11 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches */ -std::unique_ptr get_json_object( - cudf::strings_column_view const& col, - cudf::string_scalar const& json_path, - get_json_object_options options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr get_json_object(cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + cudf::strings::get_json_object_options options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp index a9a6ef00103..aa6fb2feb3d 100644 --- a/cpp/include/cudf/strings/detail/replace.hpp +++ b/cpp/include/cudf/strings/detail/replace.hpp @@ -43,14 +43,12 @@ enum class replace_algorithm { * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ template -std::unique_ptr replace( - strings_column_view const& strings, - string_scalar const& target, - string_scalar const& repl, - int32_t maxrepl = -1, - // Move before maxrepl? - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace(strings_column_view const& strings, + string_scalar const& target, + string_scalar const& repl, + int32_t maxrepl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::strings::replace_slice(strings_column_view const&, string_scalar const&, @@ -58,14 +56,12 @@ std::unique_ptr replace( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr replace_slice( - strings_column_view const& strings, - string_scalar const& repl = string_scalar(""), - size_type start = 0, - size_type stop = -1, - // Move before repl? - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace_slice(strings_column_view const& strings, + string_scalar const& repl, + size_type start, + size_type stop, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::strings::replace(strings_column_view const&, strings_column_view const&, @@ -73,12 +69,11 @@ std::unique_ptr replace_slice( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr replace( - strings_column_view const& strings, - strings_column_view const& targets, - strings_column_view const& repls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace(strings_column_view const& strings, + strings_column_view const& targets, + strings_column_view const& repls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Replaces any null string entries with the given string. @@ -98,12 +93,10 @@ std::unique_ptr replace( * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ -std::unique_ptr replace_nulls( - strings_column_view const& strings, - string_scalar const& repl = string_scalar(""), - // Move before repl? - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace_nulls(strings_column_view const& strings, + string_scalar const& repl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index c8a90ea538a..7d6a07b4b10 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -57,18 +57,18 @@ namespace detail { * @return New strings column. 
*/ template -std::unique_ptr scatter( - SourceIterator begin, - SourceIterator end, - MapIterator scatter_map, - strings_column_view const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr scatter(SourceIterator begin, + SourceIterator end, + MapIterator scatter_map, + strings_column_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (target.is_empty()) return make_empty_column(type_id::STRING); // create vector of string_view's to scatter into - rmm::device_uvector target_vector = create_string_vector_from_column(target, stream); + rmm::device_uvector target_vector = + create_string_vector_from_column(target, stream, rmm::mr::get_current_device_resource()); // this ensures empty strings are not mapped to nulls in the make_strings_column function auto const size = thrust::distance(begin, end); diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index 9404ac14775..76e5f931981 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -50,11 +50,10 @@ namespace detail { * @return offsets child column for strings column */ template -std::unique_ptr make_offsets_child_column( - InputIterator begin, - InputIterator end, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr make_offsets_child_column(InputIterator begin, + InputIterator end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(begin < end, "Invalid iterator range"); auto count = thrust::distance(begin, end); @@ -117,12 +116,11 @@ __device__ inline char* copy_string(char* buffer, const string_view& d_string) * @return offsets child column and chars child column for a strings column */ template -auto make_strings_children( - SizeAndExecuteFunction 
size_and_exec_fn, - size_type exec_size, - size_type strings_count, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, + size_type exec_size, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto offsets_column = make_numeric_column( data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); @@ -175,11 +173,10 @@ auto make_strings_children( * @return offsets child column and chars child column for a strings column */ template -auto make_strings_children( - SizeAndExecuteFunction size_and_exec_fn, - size_type strings_count, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr); } diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index 829e0207110..41a2654dce3 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -36,10 +36,9 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return The chars child column for a strings column. */ -std::unique_ptr create_chars_child_column( - size_type bytes, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr create_chars_child_column(size_type bytes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Creates a string_view vector from a strings column. 
@@ -52,7 +51,7 @@ std::unique_ptr create_chars_child_column( rmm::device_uvector create_string_vector_from_column( cudf::strings_column_view const strings, rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index fd794b2e66c..20912e9209f 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -398,7 +398,7 @@ bool is_compression_enabled(compression_type compression) return false; case compression_type::SNAPPY: return detail::nvcomp_integration::is_stable_enabled(); case compression_type::ZSTD: - return NVCOMP_HAS_ZSTD_COMP and detail::nvcomp_integration::is_all_enabled(); + return NVCOMP_HAS_ZSTD_COMP and detail::nvcomp_integration::is_stable_enabled(); default: return false; } return false; diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 7230b455d4a..ed2f412f291 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -364,8 +364,11 @@ void write_chunked(data_sink* out_sink, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); cudf::string_scalar newline{options.get_line_terminator()}; - auto p_str_col_w_nl = - cudf::strings::detail::join_strings(str_column_view, newline, string_scalar("", false), stream); + auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, + newline, + string_scalar("", false), + stream, + rmm::mr::get_current_device_resource()); strings_column_view strings_column{p_str_col_w_nl->view()}; auto total_num_bytes = strings_column.chars_size(); @@ -470,9 +473,11 @@ void write_csv(data_sink* out_sink, delimiter_str, options.get_na_rep(), strings::separator_on_nulls::YES, - stream); + stream, + rmm::mr::get_current_device_resource()); cudf::string_scalar narep{options.get_na_rep()}; - 
return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream); + return cudf::strings::detail::replace_nulls( + str_table_view.column(0), narep, stream, rmm::mr::get_current_device_resource()); }(); write_chunked(out_sink, str_concat_col->view(), options, stream, mr); diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index cee023a1061..0ac3efb407e 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -525,14 +525,15 @@ void make_device_json_column(device_span input, auto parent_node_id = ordered_parent_node_ids[i]; if (parent_node_id != parent_node_sentinel and node_categories[parent_node_id] == NC_LIST) { // unique item - if (i == 0 || + if (i == 0 or (col_ids[i - 1] != col_ids[i] or ordered_parent_node_ids[i - 1] != parent_node_id)) { // scatter to list_offset d_columns_data[original_col_ids[parent_node_id]] .child_offsets[row_offsets[parent_node_id]] = ordered_row_offsets[i]; } // TODO: verify if this code is right. check with more test cases. - if (i == num_nodes - 1 || (col_ids[i] != col_ids[i + 1])) { + if (i == num_nodes - 1 or + (col_ids[i] != col_ids[i + 1] or ordered_parent_node_ids[i + 1] != parent_node_id)) { // last value of list child_offset is its size. 
d_columns_data[original_col_ids[parent_node_id]] .child_offsets[row_offsets[parent_node_id] + 1] = ordered_row_offsets[i] + 1; diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu index ea01b570b91..127d3aa8fe7 100644 --- a/cpp/src/strings/attributes.cu +++ b/cpp/src/strings/attributes.cu @@ -87,19 +87,17 @@ std::unique_ptr counts_fn(strings_column_view const& strings, } // namespace -std::unique_ptr count_characters( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr count_characters(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto ufn = [] __device__(const string_view& d_str) { return d_str.length(); }; return counts_fn(strings, ufn, stream, mr); } -std::unique_ptr count_bytes( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr count_bytes(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto ufn = [] __device__(const string_view& d_str) { return d_str.size_bytes(); }; return counts_fn(strings, ufn, stream, mr); @@ -135,10 +133,9 @@ struct code_points_fn { namespace detail { // -std::unique_ptr code_points( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr code_points(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 05c2904ec9e..a2cee757112 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -147,30 +147,27 @@ std::unique_ptr 
convert_case(strings_column_view const& strings, } // namespace -std::unique_ptr to_lower( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr to_lower(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { character_flags_table_type case_flag = IS_UPPER(0xFF); // convert only upper case characters return convert_case(strings, case_flag, stream, mr); } // -std::unique_ptr to_upper( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr to_upper(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { character_flags_table_type case_flag = IS_LOWER(0xFF); // convert only lower case characters return convert_case(strings, case_flag, stream, mr); } // -std::unique_ptr swapcase( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr swapcase(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // convert only upper or lower case characters character_flags_table_type case_flag = IS_LOWER(0xFF) | IS_UPPER(0xFF); diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 0426d82c6c6..aa1e4dce4d0 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -38,12 +38,11 @@ namespace cudf { namespace strings { namespace detail { // -std::unique_ptr all_characters_of_type( - strings_column_view const& strings, - string_character_types types, - string_character_types verify_types, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) 
+std::unique_ptr all_characters_of_type(strings_column_view const& strings, + string_character_types types, + string_character_types verify_types, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); auto strings_column = column_device_view::create(strings.parent(), stream); diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu index c6e71b00809..80941990610 100644 --- a/cpp/src/strings/contains.cu +++ b/cpp/src/strings/contains.cu @@ -86,32 +86,29 @@ std::unique_ptr contains_impl(strings_column_view const& input, } // namespace -std::unique_ptr contains_re( - strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr contains_re(strings_column_view const& input, + std::string_view pattern, + regex_flags const flags, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return contains_impl(input, pattern, flags, false, stream, mr); } -std::unique_ptr matches_re( - strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr matches_re(strings_column_view const& input, + std::string_view pattern, + regex_flags const flags, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return contains_impl(input, pattern, flags, true, stream, mr); } -std::unique_ptr count_re( - strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr count_re(strings_column_view const& input, + std::string_view pattern, + regex_flags const flags, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* 
mr) { // compile regex into device object auto d_prog = reprog_device::create(pattern, flags, capture_groups::NON_CAPTURE, stream); diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 2de4bd2a2cc..49713731ff5 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -454,10 +454,9 @@ std::unique_ptr from_floats(column_view const& floats, rmm::mr::device_m } namespace detail { -std::unique_ptr is_float( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr is_float(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_column = column_device_view::create(strings.parent(), stream); auto d_column = *strings_column; diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index dbbdffac2c2..f41232a4af6 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -206,11 +206,10 @@ struct dispatch_integers_to_hex_fn { } // namespace // This will convert a strings column into any integer column type. 
-std::unique_ptr hex_to_integers( - strings_column_view const& strings, - data_type output_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr hex_to_integers(strings_column_view const& strings, + data_type output_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); if (strings_count == 0) return make_empty_column(output_type); diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 343288af0c1..ed40c47b99d 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -157,10 +157,9 @@ struct dispatch_is_integer_fn { } // namespace -std::unique_ptr is_integer( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr is_integer(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const d_column = column_device_view::create(strings.parent(), stream); auto results = make_numeric_column(data_type{type_id::BOOL8}, @@ -192,11 +191,10 @@ std::unique_ptr is_integer( return results; } -std::unique_ptr is_integer( - strings_column_view const& strings, - data_type int_type, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr is_integer(strings_column_view const& strings, + data_type int_type, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (strings.is_empty()) { return cudf::make_empty_column(type_id::BOOL8); } return type_dispatcher(int_type, dispatch_is_integer_fn{}, strings, stream, mr); diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index 5229f0fdf1b..0dcb2b61446 100644 --- 
a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -75,10 +75,9 @@ struct ipv4_to_integers_fn { } // namespace // Convert strings column of IPv4 addresses to integers column -std::unique_ptr ipv4_to_integers( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr ipv4_to_integers(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); if (strings_count == 0) return make_numeric_column(data_type{type_id::INT64}, 0); @@ -162,10 +161,9 @@ struct integers_to_ipv4_fn { } // namespace // Convert integers into IPv4 addresses -std::unique_ptr integers_to_ipv4( - column_view const& integers, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr integers_to_ipv4(column_view const& integers, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = integers.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index 0c6ecf46313..25e37526f59 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -129,10 +129,9 @@ struct url_encoder_fn { } // namespace // -std::unique_ptr url_encode( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr url_encode(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); @@ -388,10 +387,9 @@ __global__ void url_decode_char_replacer(column_device_view const 
in_strings, } // namespace // -std::unique_ptr url_decode( - strings_column_view const& strings, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr url_decode(strings_column_view const& strings, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); diff --git a/cpp/src/strings/count_matches.hpp b/cpp/src/strings/count_matches.hpp index d4bcdaf4042..a4f76c1c5e3 100644 --- a/cpp/src/strings/count_matches.hpp +++ b/cpp/src/strings/count_matches.hpp @@ -41,12 +41,11 @@ class reprog_device; * @param mr Device memory resource used to allocate the returned column's device memory. * @return Integer column of match counts */ -std::unique_ptr count_matches( - column_device_view const& d_strings, - reprog_device& d_prog, - size_type output_size, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr count_matches(column_device_view const& d_strings, + reprog_device& d_prog, + size_type output_size, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu index 1ba5a8a1470..e669d2178a2 100644 --- a/cpp/src/strings/extract/extract_all.cu +++ b/cpp/src/strings/extract/extract_all.cu @@ -95,12 +95,11 @@ struct extract_fn { * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr extract_all_record( - strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr extract_all_record(strings_column_view const& input, + std::string_view pattern, + regex_flags const flags, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const strings_count = input.size(); auto const d_strings = column_device_view::create(input.parent(), stream); diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu index f813ec24ee9..4bd98ee4cdc 100644 --- a/cpp/src/strings/filling/fill.cu +++ b/cpp/src/strings/filling/fill.cu @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -35,13 +35,12 @@ namespace cudf { namespace strings { namespace detail { -std::unique_ptr fill( - strings_column_view const& strings, - size_type begin, - size_type end, - string_scalar const& value, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr fill(strings_column_view const& strings, + size_type begin, + size_type end, + string_scalar const& value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = strings.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); diff --git a/cpp/src/strings/like.cu b/cpp/src/strings/like.cu index cb6fc844426..4e4df6cb1ad 100644 --- a/cpp/src/strings/like.cu +++ b/cpp/src/strings/like.cu @@ -102,12 +102,11 @@ struct like_fn { } // namespace -std::unique_ptr like( - strings_column_view const& input, - string_scalar const& pattern, - string_scalar const& escape_character, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr like(strings_column_view const& input, + string_scalar const& 
pattern, + string_scalar const& escape_character, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto results = make_numeric_column(data_type{type_id::BOOL8}, input.size(), diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index d84b4afc7cf..e5497849681 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -53,13 +53,12 @@ struct compute_pad_output_length_fn { } // namespace -std::unique_ptr pad( - strings_column_view const& strings, - size_type width, - side_type side = side_type::RIGHT, - std::string_view fill_char = " ", - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr pad(strings_column_view const& strings, + size_type width, + side_type side, + std::string_view fill_char, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); @@ -128,11 +127,10 @@ std::unique_ptr pad( std::move(null_mask)); } -std::unique_ptr zfill( - strings_column_view const& input, - size_type width, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr zfill(strings_column_view const& input, + size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return make_empty_column(type_id::STRING); diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index cc5cf1384ec..f15496ac159 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -125,13 +125,12 @@ struct replace_multi_regex_fn { } // namespace -std::unique_ptr replace_re( - strings_column_view const& input, - std::vector const& patterns, - strings_column_view const& replacements, - regex_flags const flags, - rmm::cuda_stream_view stream, - 
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr replace_re(strings_column_view const& input, + std::vector const& patterns, + strings_column_view const& replacements, + regex_flags const flags, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) { return make_empty_column(type_id::STRING); } if (patterns.empty()) { // if no patterns; just return a copy diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 04cb074c016..e9cc60f1d64 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -100,14 +100,13 @@ struct replace_regex_fn { } // namespace // -std::unique_ptr replace_re( - strings_column_view const& input, - std::string_view pattern, - string_scalar const& replacement, - std::optional max_replace_count, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr replace_re(strings_column_view const& input, + std::string_view pattern, + string_scalar const& replacement, + std::optional max_replace_count, + regex_flags const flags, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return make_empty_column(type_id::STRING); diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index c48aedc5499..e6384d5d6e1 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -102,13 +102,12 @@ std::unique_ptr find_fn(strings_column_view const& strings, } // namespace -std::unique_ptr find( - strings_column_view const& strings, - string_scalar const& target, - size_type start = 0, - size_type stop = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr find(strings_column_view const& strings, + string_scalar 
const& target, + size_type start, + size_type stop, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto pfn = [] __device__( string_view d_string, string_view d_target, size_type start, size_type stop) { @@ -122,13 +121,12 @@ std::unique_ptr find( return find_fn(strings, target, start, stop, pfn, stream, mr); } -std::unique_ptr rfind( - strings_column_view const& strings, - string_scalar const& target, - size_type start = 0, - size_type stop = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr rfind(strings_column_view const& strings, + string_scalar const& target, + size_type start, + size_type stop, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto pfn = [] __device__( string_view d_string, string_view d_target, size_type start, size_type stop) { @@ -366,11 +364,10 @@ std::unique_ptr contains_fn(strings_column_view const& strings, } // namespace -std::unique_ptr contains( - strings_column_view const& input, - string_scalar const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr contains(strings_column_view const& input, + string_scalar const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // use warp parallel when the average string width is greater than the threshold if (!input.is_empty() && ((input.chars_size() / input.size()) > AVG_CHAR_BYTES_THRESHOLD)) { @@ -384,11 +381,10 @@ std::unique_ptr contains( return contains_fn(input, target, pfn, stream, mr); } -std::unique_ptr contains( - strings_column_view const& strings, - strings_column_view const& targets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr contains(strings_column_view const& strings, + strings_column_view const& targets, + 
rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto pfn = [] __device__(string_view d_string, string_view d_target) { return d_string.find(d_target) != string_view::npos; @@ -396,11 +392,10 @@ std::unique_ptr contains( return contains_fn(strings, targets, pfn, stream, mr); } -std::unique_ptr starts_with( - strings_column_view const& strings, - string_scalar const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr starts_with(strings_column_view const& strings, + string_scalar const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto pfn = [] __device__(string_view d_string, string_view d_target) { return (d_target.size_bytes() <= d_string.size_bytes()) && @@ -409,11 +404,10 @@ std::unique_ptr starts_with( return contains_fn(strings, target, pfn, stream, mr); } -std::unique_ptr starts_with( - strings_column_view const& strings, - strings_column_view const& targets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr starts_with(strings_column_view const& strings, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto pfn = [] __device__(string_view d_string, string_view d_target) { return (d_target.size_bytes() <= d_string.size_bytes()) && @@ -422,11 +416,10 @@ std::unique_ptr starts_with( return contains_fn(strings, targets, pfn, stream, mr); } -std::unique_ptr ends_with( - strings_column_view const& strings, - string_scalar const& target, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr ends_with(strings_column_view const& strings, + string_scalar const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto pfn = [] __device__(string_view d_string, string_view d_target) { 
auto const str_size = d_string.size_bytes(); @@ -438,11 +431,10 @@ std::unique_ptr ends_with( return contains_fn(strings, target, pfn, stream, mr); } -std::unique_ptr ends_with( - strings_column_view const& strings, - strings_column_view const& targets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr ends_with(strings_column_view const& strings, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto pfn = [] __device__(string_view d_string, string_view d_target) { auto const str_size = d_string.size_bytes(); diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu index 389e6eccc43..1907c0d749b 100644 --- a/cpp/src/strings/search/find_multiple.cu +++ b/cpp/src/strings/search/find_multiple.cu @@ -34,11 +34,10 @@ namespace cudf { namespace strings { namespace detail { -std::unique_ptr find_multiple( - strings_column_view const& input, - strings_column_view const& targets, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr find_multiple(strings_column_view const& input, + strings_column_view const& targets, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const strings_count = input.size(); auto const targets_count = targets.size(); diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu index 07829581aa6..b5b8cab65a7 100644 --- a/cpp/src/strings/search/findall.cu +++ b/cpp/src/strings/search/findall.cu @@ -92,12 +92,11 @@ std::unique_ptr findall_util(column_device_view const& d_strings, } // namespace // -std::unique_ptr findall( - strings_column_view const& input, - std::string_view pattern, - regex_flags const flags, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr 
findall(strings_column_view const& input, + std::string_view pattern, + regex_flags const flags, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const strings_count = input.size(); auto const d_strings = column_device_view::create(input.parent(), stream); diff --git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu index acdd9efbb45..09aadb78554 100644 --- a/cpp/src/strings/split/partition.cu +++ b/cpp/src/strings/split/partition.cu @@ -181,11 +181,10 @@ struct rpartition_fn : public partition_fn { } // namespace -std::unique_ptr partition( - strings_column_view const& strings, - string_scalar const& delimiter = string_scalar(""), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr
partition(strings_column_view const& strings, + string_scalar const& delimiter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); auto strings_count = strings.size(); @@ -209,11 +208,10 @@ std::unique_ptr
partition( return std::make_unique
(std::move(results)); } -std::unique_ptr
rpartition( - strings_column_view const& strings, - string_scalar const& delimiter = string_scalar(""), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr
rpartition(strings_column_view const& strings, + string_scalar const& delimiter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); auto strings_count = strings.size(); diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 89b4c1d75c2..c11d7ad47f9 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -791,12 +791,11 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, } // namespace -std::unique_ptr
split( - strings_column_view const& strings_column, - string_scalar const& delimiter = string_scalar(""), - size_type maxsplit = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr
split(strings_column_view const& strings_column, + string_scalar const& delimiter, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); @@ -816,12 +815,11 @@ std::unique_ptr
split( strings_column, split_tokenizer_fn{*strings_device_view, d_delimiter, max_tokens}, stream, mr); } -std::unique_ptr
rsplit( - strings_column_view const& strings_column, - string_scalar const& delimiter = string_scalar(""), - size_type maxsplit = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr
rsplit(strings_column_view const& strings_column, + string_scalar const& delimiter, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index 2538bab6229..a17c0a575fb 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -205,7 +205,8 @@ std::unique_ptr
split_re(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); // count the number of delimiters matched in each string - auto offsets = count_matches(*d_strings, *d_prog, strings_count + 1, stream); + auto offsets = count_matches( + *d_strings, *d_prog, strings_count + 1, stream, rmm::mr::get_current_device_resource()); auto offsets_view = offsets->mutable_view(); auto d_offsets = offsets_view.data(); diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu index 83d8d7f9203..d935ad0b1da 100644 --- a/cpp/src/strings/split/split_record.cu +++ b/cpp/src/strings/split/split_record.cu @@ -264,12 +264,11 @@ std::unique_ptr split_record_fn(strings_column_view const& strings, } template -std::unique_ptr split_record( - strings_column_view const& strings, - string_scalar const& delimiter = string_scalar(""), - size_type maxsplit = -1, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr split_record(strings_column_view const& strings, + string_scalar const& delimiter, + size_type maxsplit, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid"); diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index ca30eb3f6d8..2159b67774e 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -56,13 +56,12 @@ std::unique_ptr make_strings_column( return cudf::strings::detail::make_strings_column(strings.begin(), strings.end(), stream, mr); } -std::unique_ptr make_strings_column( - device_span chars, - device_span offsets, - size_type null_count, - rmm::device_buffer&& null_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) 
+std::unique_ptr make_strings_column(device_span chars, + device_span offsets, + size_type null_count, + rmm::device_buffer&& null_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu index e982050b8d6..6fb7c671a87 100644 --- a/cpp/src/strings/strip.cu +++ b/cpp/src/strings/strip.cu @@ -56,12 +56,11 @@ struct strip_transform_fn { } // namespace -std::unique_ptr strip( - strings_column_view const& input, - side_type side = side_type::BOTH, - string_scalar const& to_strip = string_scalar(""), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr strip(strings_column_view const& input, + side_type side, + string_scalar const& to_strip, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (input.is_empty()) return make_empty_column(type_id::STRING); diff --git a/cpp/src/strings/substring.cu b/cpp/src/strings/substring.cu index e0d1bc8cf31..2acc834a1cb 100644 --- a/cpp/src/strings/substring.cu +++ b/cpp/src/strings/substring.cu @@ -105,13 +105,12 @@ struct substring_fn { } // namespace // -std::unique_ptr slice_strings( - strings_column_view const& strings, - numeric_scalar const& start = numeric_scalar(0, false), - numeric_scalar const& stop = numeric_scalar(0, false), - numeric_scalar const& step = numeric_scalar(1), - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr slice_strings(strings_column_view const& strings, + numeric_scalar const& start, + numeric_scalar const& stop, + numeric_scalar const& step, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (strings.is_empty()) return make_empty_column(type_id::STRING); @@ -291,12 +290,11 @@ void compute_substring_indices(column_device_view const& d_column, 
} // namespace // -std::unique_ptr slice_strings( - strings_column_view const& strings, - column_view const& starts_column, - column_view const& stops_column, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr slice_strings(strings_column_view const& strings, + column_view const& starts_column, + column_view const& stops_column, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { size_type strings_count = strings.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 01ecc49f10a..5b23b092cce 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -86,11 +86,10 @@ struct translate_fn { } // namespace // -std::unique_ptr translate( - strings_column_view const& strings, - std::vector> const& chars_table, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr translate(strings_column_view const& strings, + std::vector> const& chars_table, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (strings.is_empty()) return make_empty_column(type_id::STRING); diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index cd0aafc3545..335908d65d1 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -91,11 +91,10 @@ struct execute_wrap { } // namespace template -std::unique_ptr wrap( - strings_column_view const& strings, - size_type width, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr wrap(strings_column_view const& strings, + size_type width, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(width > 0, "Positive wrap width required"); diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp index 
488184f4099..11f5c9f39aa 100644 --- a/cpp/tests/strings/array_tests.cpp +++ b/cpp/tests/strings/array_tests.cpp @@ -17,11 +17,11 @@ #include #include #include +#include #include #include #include -#include #include #include @@ -61,14 +61,14 @@ class SliceParmsTest : public StringsColumnTest, TEST_P(SliceParmsTest, Slice) { std::vector h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + cudf::test::strings_column_wrapper input( + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); cudf::size_type start = 3; cudf::size_type end = GetParam(); - auto results = cudf::strings::detail::copy_slice(cudf::strings_column_view(strings), start, end); + + auto scol = cudf::slice(input, {start, end}); + auto results = std::make_unique(scol.front()); cudf::test::strings_column_wrapper expected( h_strings.begin() + start, @@ -81,14 +81,14 @@ TEST_P(SliceParmsTest, Slice) TEST_P(SliceParmsTest, SliceAllNulls) { std::vector h_strings{nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + cudf::test::strings_column_wrapper input( + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); cudf::size_type start = 3; cudf::size_type end = GetParam(); - auto results = cudf::strings::detail::copy_slice(cudf::strings_column_view(strings), start, end); + + auto scol = cudf::slice(input, {start, end}); + auto results = std::make_unique(scol.front()); cudf::test::strings_column_wrapper expected( h_strings.begin() + start, @@ -101,11 +101,13 @@ TEST_P(SliceParmsTest, SliceAllNulls) TEST_P(SliceParmsTest, SliceAllEmpty) { std::vector h_strings{"", 
"", "", "", "", "", ""}; - cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); + cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end()); cudf::size_type start = 3; cudf::size_type end = GetParam(); - auto results = cudf::strings::detail::copy_slice(cudf::strings_column_view(strings), start, end); + + auto scol = cudf::slice(input, {start, end}); + auto results = std::make_unique(scol.front()); cudf::test::strings_column_wrapper expected(h_strings.begin() + start, h_strings.begin() + end); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -119,8 +121,8 @@ TEST_F(StringsColumnTest, SliceZeroSizeStringsColumn) { cudf::column_view zero_size_strings_column( cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto strings_view = cudf::strings_column_view(zero_size_strings_column); - auto results = cudf::strings::detail::copy_slice(strings_view, 1, 2); + auto scol = cudf::slice(zero_size_strings_column, {0, 0}); + auto results = std::make_unique(scol.front()); cudf::test::expect_column_empty(results->view()); } @@ -128,18 +130,14 @@ TEST_F(StringsColumnTest, Gather) { std::vector h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); cudf::test::fixed_width_column_wrapper gather_map{{4, 1}}; auto results = cudf::gather(cudf::table_view{{strings}}, gather_map)->release(); std::vector h_expected{"aa", "bb"}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(results.front()->view(), expected); } diff --git a/cpp/tests/strings/concatenate_tests.cpp b/cpp/tests/strings/concatenate_tests.cpp index 387f0f5c997..e4f2f7ca62c 100644 --- a/cpp/tests/strings/concatenate_tests.cpp +++ b/cpp/tests/strings/concatenate_tests.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include @@ -60,7 +60,7 @@ TEST_F(StringsConcatenateTest, Concatenate) strings_columns.push_back(strings2); strings_columns.push_back(strings3); - auto results = cudf::strings::detail::concatenate(strings_columns, cudf::get_default_stream()); + auto results = cudf::concatenate(strings_columns); cudf::test::strings_column_wrapper expected(h_strings.begin(), h_strings.end()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); @@ -74,7 +74,7 @@ TEST_F(StringsConcatenateTest, ZeroSizeStringsColumns) strings_columns.push_back(zero_size_strings_column); strings_columns.push_back(zero_size_strings_column); strings_columns.push_back(zero_size_strings_column); - auto results = cudf::strings::detail::concatenate(strings_columns, cudf::get_default_stream()); + auto results = cudf::concatenate(strings_columns); cudf::test::expect_column_empty(results->view()); } @@ -107,6 +107,6 @@ TEST_F(StringsConcatenateTest, ZeroSizeStringsPlusNormal) h_strings.data() + h_strings.size()); strings_columns.push_back(strings1); - auto results = cudf::strings::detail::concatenate(strings_columns, cudf::get_default_stream()); + auto results = cudf::concatenate(strings_columns); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings1); } diff --git a/cpp/tests/strings/fill_tests.cpp b/cpp/tests/strings/fill_tests.cpp index ed731fe39b4..c3a1710bb83 100644 --- a/cpp/tests/strings/fill_tests.cpp +++ b/cpp/tests/strings/fill_tests.cpp @@ -17,13 +17,11 @@ #include #include #include +#include #include +#include #include -#include -#include - -#include #include @@ -33,48 +31,37 @@ struct StringsFillTest : public cudf::test::BaseFixture { 
TEST_F(StringsFillTest, Fill) { std::vector h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view view(strings); + cudf::test::strings_column_wrapper input( + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); + { - auto results = cudf::strings::detail::fill( - view, 1, 5, cudf::string_scalar("zz"), cudf::get_default_stream()); + auto results = cudf::fill(input, 1, 5, cudf::string_scalar("zz")); std::vector h_expected{"eee", "zz", "zz", "zz", "zz", "bbb", "ééé"}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::detail::fill( - view, 2, 4, cudf::string_scalar("", false), cudf::get_default_stream()); + auto results = cudf::fill(input, 2, 4, cudf::string_scalar("", false)); std::vector h_expected{"eee", "bb", nullptr, nullptr, "aa", "bbb", "ééé"}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::detail::fill( - view, 5, 5, cudf::string_scalar("zz"), cudf::get_default_stream()); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, view.parent()); + auto results = cudf::fill(input, 5, 5, cudf::string_scalar("zz")); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, input); } { - auto results 
= - cudf::strings::detail::fill(view, 0, 7, cudf::string_scalar(""), cudf::get_default_stream()); + auto results = cudf::fill(input, 0, 7, cudf::string_scalar("")); cudf::test::strings_column_wrapper expected({"", "", "", "", "", "", ""}, {1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); } { - auto results = cudf::strings::detail::fill( - view, 0, 7, cudf::string_scalar("", false), cudf::get_default_stream()); + auto results = cudf::fill(input, 0, 7, cudf::string_scalar("", false)); cudf::test::strings_column_wrapper expected({"", "", "", "", "", "", ""}, {0, 0, 0, 0, 0, 0, 0}); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); @@ -85,27 +72,16 @@ TEST_F(StringsFillTest, ZeroSizeStringsColumns) { cudf::column_view zero_size_strings_column( cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0); - auto results = cudf::strings::detail::fill(cudf::strings_column_view(zero_size_strings_column), - 0, - 1, - cudf::string_scalar(""), - cudf::get_default_stream()); + auto results = cudf::fill(zero_size_strings_column, 0, 0, cudf::string_scalar("")); cudf::test::expect_column_empty(results->view()); } TEST_F(StringsFillTest, FillRangeError) { std::vector h_strings{"eee", "bb", nullptr, "", "aa", "bbb", "ééé"}; - cudf::test::strings_column_wrapper strings( - h_strings.begin(), - h_strings.end(), - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })); - cudf::strings_column_view view(strings); + cudf::test::strings_column_wrapper input( + h_strings.begin(), h_strings.end(), cudf::test::iterators::nulls_from_nullptrs(h_strings)); - EXPECT_THROW( - cudf::strings::detail::fill(view, 5, 1, cudf::string_scalar(""), cudf::get_default_stream()), - cudf::logic_error); - EXPECT_THROW( - cudf::strings::detail::fill(view, 5, 9, cudf::string_scalar(""), cudf::get_default_stream()), - cudf::logic_error); + EXPECT_THROW(cudf::fill(input, 5, 1, cudf::string_scalar("")), cudf::logic_error); + 
EXPECT_THROW(cudf::fill(input, 5, 9, cudf::string_scalar("")), cudf::logic_error); } diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp index cd39c1e088a..da0667f54cf 100644 --- a/cpp/tests/strings/replace_tests.cpp +++ b/cpp/tests/strings/replace_tests.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -50,8 +51,8 @@ struct StringsReplaceTest : public cudf::test::BaseFixture { TEST_F(StringsReplaceTest, Replace) { - auto strings = build_corpus(); - auto strings_view = cudf::strings_column_view(strings); + auto input = build_corpus(); + auto strings_view = cudf::strings_column_view(input); // replace all occurrences of 'the ' with '++++ ' std::vector h_expected{"++++ quick brown fox jumps over ++++ lazy dog", "++++ fat cat lays next to ++++ other accénted cat", @@ -61,24 +62,29 @@ TEST_F(StringsReplaceTest, Replace) "", nullptr}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); + + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + auto results = cudf::strings::replace(strings_view, cudf::string_scalar("the "), cudf::string_scalar("++++ ")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("the "), cudf::string_scalar("++++ ")); + strings_view, cudf::string_scalar("the "), cudf::string_scalar("++++ "), -1, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("the "), cudf::string_scalar("++++ ")); + strings_view, cudf::string_scalar("the "), cudf::string_scalar("++++ "), -1, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } 
TEST_F(StringsReplaceTest, ReplaceReplLimit) { - auto strings = build_corpus(); - auto strings_view = cudf::strings_column_view(strings); + auto input = build_corpus(); + auto strings_view = cudf::strings_column_view(input); + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + // only remove the first occurrence of 'the ' std::vector h_expected{"quick brown fox jumps over the lazy dog", "fat cat lays next to the other accénted cat", @@ -88,23 +94,21 @@ TEST_F(StringsReplaceTest, ReplaceReplLimit) "", nullptr}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); auto results = cudf::strings::replace(strings_view, cudf::string_scalar("the "), cudf::string_scalar(""), 1); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("the "), cudf::string_scalar(""), 1); + strings_view, cudf::string_scalar("the "), cudf::string_scalar(""), 1, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("the "), cudf::string_scalar(""), 1); + strings_view, cudf::string_scalar("the "), cudf::string_scalar(""), 1, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(StringsReplaceTest, ReplaceReplLimitInputSliced) { - auto strings = build_corpus(); + auto input = build_corpus(); // replace first two occurrences of ' ' with '--' std::vector h_expected{"the--quick--brown fox jumps over the lazy dog", "the--fat--cat lays next to the other accénted cat", @@ -114,11 +118,11 @@ TEST_F(StringsReplaceTest, ReplaceReplLimitInputSliced) "", nullptr}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - 
thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); std::vector slice_indices{0, 2, 2, 3, 3, 7}; - auto sliced_strings = cudf::slice(strings, slice_indices); + auto sliced_strings = cudf::slice(input, slice_indices); auto sliced_expected = cudf::slice(expected, slice_indices); for (size_t i = 0; i < sliced_strings.size(); ++i) { auto strings_view = cudf::strings_column_view(sliced_strings[i]); @@ -126,10 +130,10 @@ TEST_F(StringsReplaceTest, ReplaceReplLimitInputSliced) cudf::strings::replace(strings_view, cudf::string_scalar(" "), cudf::string_scalar("--"), 2); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, sliced_expected[i]); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar(" "), cudf::string_scalar("--"), 2); + strings_view, cudf::string_scalar(" "), cudf::string_scalar("--"), 2, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, sliced_expected[i]); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar(" "), cudf::string_scalar("--"), 2); + strings_view, cudf::string_scalar(" "), cudf::string_scalar("--"), 2, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, sliced_expected[i]); } } @@ -139,9 +143,9 @@ TEST_F(StringsReplaceTest, ReplaceTargetOverlap) auto corpus = build_corpus(); auto corpus_view = cudf::strings_column_view(corpus); // replace all occurrences of 'the ' with '+++++++ ' - auto strings = cudf::strings::replace( + auto input = cudf::strings::replace( corpus_view, cudf::string_scalar("the "), cudf::string_scalar("++++++++ ")); - auto strings_view = cudf::strings_column_view(*strings); + auto strings_view = cudf::strings_column_view(*input); // replace all occurrences of '+++' with 'plus ' std::vector h_expected{ "plus plus ++ quick brown fox jumps over plus plus 
++ lazy dog", @@ -152,60 +156,71 @@ TEST_F(StringsReplaceTest, ReplaceTargetOverlap) "", nullptr}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); + + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + auto results = cudf::strings::replace(strings_view, cudf::string_scalar("+++"), cudf::string_scalar("plus ")); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("+++"), cudf::string_scalar("plus ")); + strings_view, cudf::string_scalar("+++"), cudf::string_scalar("plus "), -1, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("+++"), cudf::string_scalar("plus ")); + strings_view, cudf::string_scalar("+++"), cudf::string_scalar("plus "), -1, stream, mr); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(StringsReplaceTest, ReplaceTargetOverlapsStrings) { - auto strings = build_corpus(); - auto strings_view = cudf::strings_column_view(strings); + auto input = build_corpus(); + auto strings_view = cudf::strings_column_view(input); + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + // replace all occurrences of 'dogthe' with '+' // should not replace anything unless it incorrectly matches across a string boundary auto results = cudf::strings::replace(strings_view, cudf::string_scalar("dogthe"), cudf::string_scalar("+")); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("dogthe"), cudf::string_scalar("+")); - 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + strings_view, cudf::string_scalar("dogthe"), cudf::string_scalar("+"), -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("dogthe"), cudf::string_scalar("+")); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + strings_view, cudf::string_scalar("dogthe"), cudf::string_scalar("+"), -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); } TEST_F(StringsReplaceTest, ReplaceNullInput) { std::vector h_null_strings(128); - auto strings = cudf::test::strings_column_wrapper( + auto input = cudf::test::strings_column_wrapper( h_null_strings.begin(), h_null_strings.end(), thrust::make_constant_iterator(false)); - auto strings_view = cudf::strings_column_view(strings); + auto strings_view = cudf::strings_column_view(input); + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); // replace all occurrences of '+' with '' // should not replace anything as input is all null auto results = cudf::strings::replace(strings_view, cudf::string_scalar("+"), cudf::string_scalar("")); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("+"), cudf::string_scalar("")); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + strings_view, cudf::string_scalar("+"), cudf::string_scalar(""), -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("+"), cudf::string_scalar("")); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + strings_view, cudf::string_scalar("+"), cudf::string_scalar(""), -1, stream, mr); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input); } TEST_F(StringsReplaceTest, ReplaceEndOfString) { - auto strings = build_corpus(); - auto strings_view = 
cudf::strings_column_view(strings); + auto input = build_corpus(); + auto strings_view = cudf::strings_column_view(input); + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + // replace all occurrences of 'in' with ' ' std::vector h_expected{"the quick brown fox jumps over the lazy dog", "the fat cat lays next to the other accénted cat", @@ -216,20 +231,18 @@ TEST_F(StringsReplaceTest, ReplaceEndOfString) nullptr}; cudf::test::strings_column_wrapper expected( - h_expected.begin(), - h_expected.end(), - thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; })); + h_expected.begin(), h_expected.end(), cudf::test::iterators::nulls_from_nullptrs(h_expected)); auto results = cudf::strings::replace(strings_view, cudf::string_scalar("in"), cudf::string_scalar(" ")); cudf::test::expect_columns_equal(*results, expected); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("in"), cudf::string_scalar(" ")); + strings_view, cudf::string_scalar("in"), cudf::string_scalar(" "), -1, stream, mr); cudf::test::expect_columns_equal(*results, expected); results = cudf::strings::detail::replace( - strings_view, cudf::string_scalar("in"), cudf::string_scalar(" ")); + strings_view, cudf::string_scalar("in"), cudf::string_scalar(" "), -1, stream, mr); cudf::test::expect_columns_equal(*results, expected); } diff --git a/docs/cudf/source/user_guide/io.md b/docs/cudf/source/user_guide/io.md index 9099a761f2c..3a803953502 100644 --- a/docs/cudf/source/user_guide/io.md +++ b/docs/cudf/source/user_guide/io.md @@ -170,7 +170,7 @@ If no value is set, behavior will be the same as the "STABLE" option. 
+=======================+========+========+==============+==============+=========+========+==============+==============+========+ | Snappy | ❌ | ❌ | Stable | Stable | ❌ | ❌ | Stable | Stable | ❌ | +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ - | ZSTD | ❌ | ❌ | Experimental | Experimental | ❌ | ❌ | Experimental | Experimental | ❌ | + | ZSTD | ❌ | ❌ | Stable | Stable | ❌ | ❌ | Stable | Stable | ❌ | +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ | DEFLATE | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | Experimental | Experimental | ❌ | +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+ diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 0b825937815..a8ca8a2c4d3 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -185,14 +185,30 @@ public static void resetScopedMaximumBytesAllocated() { * @throws RmmException if an active handler is already set */ public static void setEventHandler(RmmEventHandler handler) throws RmmException { + setEventHandler(handler, false); + } + + /** + * Sets the event handler to be called on RMM events (e.g.: allocation failure) and + * optionally enable debug mode (callbacks on every allocate and deallocate) + * + * NOTE: Only enable debug mode when necessary, as code will run much slower! 
+ * + * @param handler event handler to invoke on RMM events or null to clear an existing handler + * @param enableDebug if true enable debug callbacks in RmmEventHandler + * (onAllocated, onDeallocated) + * @throws RmmException if an active handler is already set + */ + public static void setEventHandler(RmmEventHandler handler, + boolean enableDebug) throws RmmException { long[] allocThresholds = (handler != null) ? sortThresholds(handler.getAllocThresholds()) : null; long[] deallocThresholds = (handler != null) ? sortThresholds(handler.getDeallocThresholds()) : null; - setEventHandlerInternal(handler, allocThresholds, deallocThresholds); + setEventHandlerInternal(handler, allocThresholds, deallocThresholds, enableDebug); } /** Clears the active RMM event handler if one is set. */ public static void clearEventHandler() throws RmmException { - setEventHandlerInternal(null, null, null); + setEventHandlerInternal(null, null, null, false); } private static long[] sortThresholds(long[] thresholds) { @@ -300,7 +316,8 @@ public static DeviceMemoryBuffer alloc(long size, Cuda.Stream stream) { static native void freeDeviceBuffer(long rmmBufferAddress) throws RmmException; static native void setEventHandlerInternal(RmmEventHandler handler, - long[] allocThresholds, long[] deallocThresholds) throws RmmException; + long[] allocThresholds, long[] deallocThresholds, + boolean enableDebug) throws RmmException; /** * Allocate device memory using `cudaMalloc` and return a pointer to device memory. 
diff --git a/java/src/main/java/ai/rapids/cudf/RmmEventHandler.java b/java/src/main/java/ai/rapids/cudf/RmmEventHandler.java index 19707b85bcb..347ef471a15 100644 --- a/java/src/main/java/ai/rapids/cudf/RmmEventHandler.java +++ b/java/src/main/java/ai/rapids/cudf/RmmEventHandler.java @@ -32,6 +32,18 @@ default boolean onAllocFailure(long sizeRequested) { "Unexpected invocation of deprecated onAllocFailure without retry count."); } + /** + * Invoked after every memory allocation when debug mode is enabled. + * @param size number of bytes allocated + */ + default void onAllocated(long size) {} + + /** + * Invoked after every memory deallocation when debug mode is enabled. + * @param size number of bytes deallocated + */ + default void onDeallocated(long size) {} + /** * Invoked on a memory allocation failure. * @param sizeRequested number of bytes that failed to allocate diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 529345b6bd8..b12f1ed0841 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -90,11 +90,14 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { void reset_scoped_max_total_allocated(std::size_t initial_value) override { std::scoped_lock lock(max_total_allocated_mutex); - scoped_allocated = 0; + scoped_allocated = initial_value; scoped_max_total_allocated = initial_value; } - std::size_t get_scoped_max_total_allocated() override { return scoped_max_total_allocated; } + std::size_t get_scoped_max_total_allocated() override { + std::scoped_lock lock(max_total_allocated_mutex); + return scoped_max_total_allocated; + } private: Upstream *const resource; @@ -123,7 +126,6 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor { if (result) { total_allocated += num_bytes; scoped_allocated += num_bytes; - std::scoped_lock lock(max_total_allocated_mutex); max_total_allocated = std::max(total_allocated.load(), 
max_total_allocated); scoped_max_total_allocated = std::max(scoped_allocated.load(), scoped_max_total_allocated); @@ -193,7 +195,7 @@ std::size_t get_scoped_max_total_allocated() { * @brief An RMM device memory resource adaptor that delegates to the wrapped resource * for most operations but will call Java to handle certain situations (e.g.: allocation failure). */ -class java_event_handler_memory_resource final : public device_memory_resource { +class java_event_handler_memory_resource : public device_memory_resource { public: java_event_handler_memory_resource(JNIEnv *env, jobject jhandler, jlongArray jalloc_thresholds, jlongArray jdealloc_thresholds, @@ -250,8 +252,6 @@ class java_event_handler_memory_resource final : public device_memory_resource { private: device_memory_resource *const resource; - JavaVM *jvm; - jobject handler_obj; jmethodID on_alloc_fail_method; bool use_old_alloc_fail_interface; jmethodID on_alloc_threshold_method; @@ -309,6 +309,18 @@ class java_event_handler_memory_resource final : public device_memory_resource { } } + bool supports_get_mem_info() const noexcept override { return resource->supports_get_mem_info(); } + + std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override { + return resource->get_mem_info(stream); + } + + bool supports_streams() const noexcept override { return resource->supports_streams(); } + +protected: + JavaVM *jvm; + jobject handler_obj; + void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override { std::size_t total_before; void *result; @@ -348,20 +360,65 @@ class java_event_handler_memory_resource final : public device_memory_resource { check_for_threshold_callback(total_after, total_before, dealloc_thresholds, on_dealloc_threshold_method, "onDeallocThreshold", total_after); } +}; - bool supports_get_mem_info() const noexcept override { return resource->supports_get_mem_info(); } +class java_debug_event_handler_memory_resource final : public 
java_event_handler_memory_resource { +public: + java_debug_event_handler_memory_resource(JNIEnv *env, jobject jhandler, + jlongArray jalloc_thresholds, + jlongArray jdealloc_thresholds, + device_memory_resource *resource_to_wrap) + : java_event_handler_memory_resource(env, jhandler, jalloc_thresholds, jdealloc_thresholds, + resource_to_wrap) { + jclass cls = env->GetObjectClass(jhandler); + if (cls == nullptr) { + throw cudf::jni::jni_exception("class not found"); + } - std::pair do_get_mem_info(rmm::cuda_stream_view stream) const override { - return resource->get_mem_info(stream); + on_allocated_method = env->GetMethodID(cls, "onAllocated", "(J)V"); + if (on_allocated_method == nullptr) { + throw cudf::jni::jni_exception("onAllocated method"); + } + + on_deallocated_method = env->GetMethodID(cls, "onDeallocated", "(J)V"); + if (on_deallocated_method == nullptr) { + throw cudf::jni::jni_exception("onDeallocated method"); + } } - bool supports_streams() const noexcept override { return resource->supports_streams(); } +private: + jmethodID on_allocated_method; + jmethodID on_deallocated_method; + + void on_allocated_callback(std::size_t num_bytes, rmm::cuda_stream_view stream) { + JNIEnv *env = cudf::jni::get_jni_env(jvm); + env->CallVoidMethod(handler_obj, on_allocated_method, num_bytes); + if (env->ExceptionCheck()) { + throw std::runtime_error("onAllocated handler threw an exception"); + } + } + + void on_deallocated_callback(void *p, std::size_t size, rmm::cuda_stream_view stream) { + JNIEnv *env = cudf::jni::get_jni_env(jvm); + env->CallVoidMethod(handler_obj, on_deallocated_method, size); + } + + void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override { + void *result = java_event_handler_memory_resource::do_allocate(num_bytes, stream); + on_allocated_callback(num_bytes, stream); + return result; + } + + void do_deallocate(void *p, std::size_t size, rmm::cuda_stream_view stream) override { + 
java_event_handler_memory_resource::do_deallocate(p, size, stream); + on_deallocated_callback(p, size, stream); + } }; std::unique_ptr Java_memory_resource{}; void set_java_device_memory_resource(JNIEnv *env, jobject handler_obj, jlongArray jalloc_thresholds, - jlongArray jdealloc_thresholds) { + jlongArray jdealloc_thresholds, jboolean enable_debug) { if (Java_memory_resource && handler_obj != nullptr) { JNI_THROW_NEW(env, RMM_EXCEPTION_CLASS, "Another event handler is already set", ) } @@ -378,8 +435,13 @@ void set_java_device_memory_resource(JNIEnv *env, jobject handler_obj, jlongArra } if (handler_obj != nullptr) { auto resource = rmm::mr::get_current_device_resource(); - Java_memory_resource.reset(new java_event_handler_memory_resource( - env, handler_obj, jalloc_thresholds, jdealloc_thresholds, resource)); + if (enable_debug) { + Java_memory_resource.reset(new java_debug_event_handler_memory_resource( + env, handler_obj, jalloc_thresholds, jdealloc_thresholds, resource)); + } else { + Java_memory_resource.reset(new java_event_handler_memory_resource( + env, handler_obj, jalloc_thresholds, jdealloc_thresholds, resource)); + } auto replaced_resource = rmm::mr::set_current_device_resource(Java_memory_resource.get()); if (resource != replaced_resource) { rmm::mr::set_current_device_resource(replaced_resource); @@ -493,7 +555,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initializeInternal(JNIEnv *env, j JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_shutdownInternal(JNIEnv *env, jclass clazz) { try { cudf::jni::auto_set_device(env); - set_java_device_memory_resource(env, nullptr, nullptr, nullptr); + set_java_device_memory_resource(env, nullptr, nullptr, nullptr, false); // Instead of trying to undo all of the adaptors that we added in reverse order // we just reset the base adaptor so the others will not be called any more // and then clean them up in really any order. 
There should be no interaction with @@ -517,7 +579,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_getMaximumTotalBytesAllocated(JN } JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_resetScopedMaximumBytesAllocatedInternal( - JNIEnv *env, jclass, long initialValue) { + JNIEnv *env, jclass, jlong initialValue) { reset_scoped_max_total_allocated(initialValue); } @@ -562,9 +624,10 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_freeDeviceBuffer(JNIEnv *env, jcl JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setEventHandlerInternal( JNIEnv *env, jclass, jobject handler_obj, jlongArray jalloc_thresholds, - jlongArray jdealloc_thresholds) { + jlongArray jdealloc_thresholds, jboolean enable_debug) { try { - set_java_device_memory_resource(env, handler_obj, jalloc_thresholds, jdealloc_thresholds); + set_java_device_memory_resource(env, handler_obj, jalloc_thresholds, jdealloc_thresholds, + enable_debug); } CATCH_STD(env, ) } diff --git a/java/src/test/java/ai/rapids/cudf/RmmTest.java b/java/src/test/java/ai/rapids/cudf/RmmTest.java index 18ff5f4081e..c081f51c9f2 100644 --- a/java/src/test/java/ai/rapids/cudf/RmmTest.java +++ b/java/src/test/java/ai/rapids/cudf/RmmTest.java @@ -115,7 +115,7 @@ public void testScopedMaxOutstanding(int rmmAllocMode) { try(DeviceMemoryBuffer ignored3 = Rmm.alloc(1024)) { Rmm.resetScopedMaximumBytesAllocated(1024); try (DeviceMemoryBuffer ignored4 = Rmm.alloc(20480)) { - assertEquals(20480, Rmm.getScopedMaximumBytesAllocated()); + assertEquals(21504, Rmm.getScopedMaximumBytesAllocated()); assertEquals(21504, Rmm.getMaximumTotalBytesAllocated()); } } @@ -157,6 +157,8 @@ public void testEventHandler(int rmmAllocMode) { AtomicInteger invokedCount = new AtomicInteger(); AtomicLong amountRequested = new AtomicLong(); AtomicInteger timesRetried = new AtomicInteger(); + AtomicLong totalAllocated = new AtomicLong(); + AtomicLong totalDeallocated = new AtomicLong(); RmmEventHandler handler = new BaseRmmEventHandler() { @Override @@ -166,6 +168,16 
@@ public boolean onAllocFailure(long sizeRequested, int retryCount) { amountRequested.set(sizeRequested); return count != 3; } + + @Override + public void onAllocated(long sizeAllocated) { + totalAllocated.addAndGet(sizeAllocated); + } + + @Override + public void onDeallocated(long sizeDeallocated) { + totalDeallocated.addAndGet(sizeDeallocated); + } }; Rmm.initialize(rmmAllocMode, Rmm.logToStderr(), 512 * 1024 * 1024); @@ -175,6 +187,10 @@ public boolean onAllocFailure(long sizeRequested, int retryCount) { assertTrue(addr.address != 0); assertEquals(0, invokedCount.get()); + // by default, we don't get callbacks on allocated or deallocated + assertEquals(0, totalAllocated.get()); + assertEquals(0, totalDeallocated.get()); + // Try to allocate too much long requested = TOO_MUCH_MEMORY; try { @@ -192,6 +208,14 @@ public boolean onAllocFailure(long sizeRequested, int retryCount) { requested = 8192; addr = Rmm.alloc(requested); addr.close(); + + // test the debug event handler + Rmm.clearEventHandler(); + Rmm.setEventHandler(handler, /*enableDebug*/ true); + addr = Rmm.alloc(1024); + addr.close(); + assertEquals(1024, totalAllocated.get()); + assertEquals(1024, totalDeallocated.get()); } @Test diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 22f8d27f9e8..6c17b492f8a 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -847,6 +847,8 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool: raise NotImplementedError() def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: + if self.dtype == dtype: + return self if is_categorical_dtype(dtype): return self.as_categorical_column(dtype, **kwargs) diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 0beb07bb591..5ee9024a0d8 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -399,4 +399,12 @@ def _get_decimal_type(lhs_dtype, 
rhs_dtype, op): # to try the next dtype continue - raise OverflowError("Maximum supported decimal type is Decimal128") + # Instead of raising an overflow error, we create a `Decimal128Dtype` + # with max possible scale & precision, see example of this demonstration + # here: https://learn.microsoft.com/en-us/sql/t-sql/data-types/ + # precision-scale-and-length-transact-sql?view=sql-server-ver16#examples + scale = min( + scale, cudf.Decimal128Dtype.MAX_PRECISION - (precision - scale) + ) + precision = min(cudf.Decimal128Dtype.MAX_PRECISION, max_precision) + return cudf.Decimal128Dtype(precision=precision, scale=scale) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 57469c0ff72..49f7101183e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1045,6 +1045,206 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): zip(self._column_names, data_columns), self._index ) + @_cudf_nvtx_annotate + def truncate(self, before=None, after=None, axis=0, copy=True): + """ + Truncate a Series or DataFrame before and after some index value. + + This is a useful shorthand for boolean indexing based on index + values above or below certain thresholds. + + Parameters + ---------- + before : date, str, int + Truncate all rows before this index value. + after : date, str, int + Truncate all rows after this index value. + axis : {0 or 'index', 1 or 'columns'}, optional + Axis to truncate. Truncates the index (rows) by default. + copy : bool, default is True, + Return a copy of the truncated section. + + Returns + ------- + The truncated Series or DataFrame. + + Notes + ----- + If the index being truncated contains only datetime values, + `before` and `after` may be specified as strings instead of + Timestamps. + + .. pandas-compat:: + **DataFrame.truncate, Series.truncate** + + The ``copy`` parameter is only present for API compatibility, but + ``copy=False`` is not supported. 
This method always generates a + copy. + + Examples + -------- + **Series** + + >>> import cudf + >>> cs1 = cudf.Series([1, 2, 3, 4]) + >>> cs1 + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> cs1.truncate(before=1, after=2) + 1 2 + 2 3 + dtype: int64 + + >>> import cudf + >>> dates = cudf.date_range( + ... '2021-01-01 23:45:00', '2021-01-01 23:46:00', freq='s' + ... ) + >>> cs2 = cudf.Series(range(len(dates)), index=dates) + >>> cs2 + 2021-01-01 23:45:00 0 + 2021-01-01 23:45:01 1 + 2021-01-01 23:45:02 2 + 2021-01-01 23:45:03 3 + 2021-01-01 23:45:04 4 + 2021-01-01 23:45:05 5 + 2021-01-01 23:45:06 6 + 2021-01-01 23:45:07 7 + 2021-01-01 23:45:08 8 + 2021-01-01 23:45:09 9 + 2021-01-01 23:45:10 10 + 2021-01-01 23:45:11 11 + 2021-01-01 23:45:12 12 + 2021-01-01 23:45:13 13 + 2021-01-01 23:45:14 14 + 2021-01-01 23:45:15 15 + 2021-01-01 23:45:16 16 + 2021-01-01 23:45:17 17 + 2021-01-01 23:45:18 18 + 2021-01-01 23:45:19 19 + 2021-01-01 23:45:20 20 + 2021-01-01 23:45:21 21 + 2021-01-01 23:45:22 22 + 2021-01-01 23:45:23 23 + 2021-01-01 23:45:24 24 + ... + 2021-01-01 23:45:56 56 + 2021-01-01 23:45:57 57 + 2021-01-01 23:45:58 58 + 2021-01-01 23:45:59 59 + dtype: int64 + + + >>> cs2.truncate( + ... before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ... ) + 2021-01-01 23:45:18 18 + 2021-01-01 23:45:19 19 + 2021-01-01 23:45:20 20 + 2021-01-01 23:45:21 21 + 2021-01-01 23:45:22 22 + 2021-01-01 23:45:23 23 + 2021-01-01 23:45:24 24 + 2021-01-01 23:45:25 25 + 2021-01-01 23:45:26 26 + 2021-01-01 23:45:27 27 + dtype: int64 + + >>> cs3 = cudf.Series({'A': 1, 'B': 2, 'C': 3, 'D': 4}) + >>> cs3 + A 1 + B 2 + C 3 + D 4 + dtype: int64 + + >>> cs3.truncate(before='B', after='C') + B 2 + C 3 + dtype: int64 + + **DataFrame** + + >>> df = cudf.DataFrame({ + ... 'A': ['a', 'b', 'c', 'd', 'e'], + ... 'B': ['f', 'g', 'h', 'i', 'j'], + ... 'C': ['k', 'l', 'm', 'n', 'o'] + ... 
}, index=[1, 2, 3, 4, 5]) + >>> df + A B C + 1 a f k + 2 b g l + 3 c h m + 4 d i n + 5 e j o + + >>> df.truncate(before=2, after=4) + A B C + 2 b g l + 3 c h m + 4 d i n + + >>> df.truncate(before="A", after="B", axis="columns") + A B + 1 a f + 2 b g + 3 c h + 4 d i + 5 e j + + >>> import cudf + >>> dates = cudf.date_range( + ... '2021-01-01 23:45:00', '2021-01-01 23:46:00', freq='s' + ... ) + >>> df2 = cudf.DataFrame(data={'A': 1, 'B': 2}, index=dates) + >>> df2.head() + A B + 2021-01-01 23:45:00 1 2 + 2021-01-01 23:45:01 1 2 + 2021-01-01 23:45:02 1 2 + 2021-01-01 23:45:03 1 2 + 2021-01-01 23:45:04 1 2 + + >>> df2.truncate( + ... before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ... ) + A B + 2021-01-01 23:45:18 1 2 + 2021-01-01 23:45:19 1 2 + 2021-01-01 23:45:20 1 2 + 2021-01-01 23:45:21 1 2 + 2021-01-01 23:45:22 1 2 + 2021-01-01 23:45:23 1 2 + 2021-01-01 23:45:24 1 2 + 2021-01-01 23:45:25 1 2 + 2021-01-01 23:45:26 1 2 + 2021-01-01 23:45:27 1 2 + """ + if not copy: + raise ValueError("Truncating with copy=False is not supported.") + axis = self._get_axis_from_axis_arg(axis) + ax = self._index if axis == 0 else self._data.to_pandas_index() + + if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing: + raise ValueError("truncate requires a sorted index") + + if type(ax) is cudf.core.index.DatetimeIndex: + before = pd.to_datetime(before) + after = pd.to_datetime(after) + + if before is not None and after is not None and before > after: + raise ValueError(f"Truncate: {after} must be after {before}") + + if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1: + before, after = after, before + + slicer = [slice(None, None)] * self.ndim + slicer[axis] = slice(before, after) + return self.loc[tuple(slicer)].copy() + @cached_property def loc(self): """Select rows and columns by label or boolean mask. 
diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index e05e8662fe4..e516177ad29 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -392,4 +392,6 @@ def _dispatch_scalar_unaop(self, op): return getattr(self.value, op)() def astype(self, dtype): + if self.dtype == dtype: + return self return Scalar(self.value, dtype) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 70e8c3d6860..8c30ae258db 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -35,6 +35,7 @@ is_integer_dtype, is_list_dtype, is_scalar, + is_string_dtype, is_struct_dtype, ) from cudf.core.abc import Serializable @@ -214,19 +215,20 @@ def __setitem__(self, key, value): value = column.as_column(value) if ( - not isinstance( - self._frame._column.dtype, - (cudf.core.dtypes.DecimalDtype, cudf.CategoricalDtype), + ( + _is_non_decimal_numeric_dtype(self._frame._column.dtype) + or is_string_dtype(self._frame._column.dtype) ) and hasattr(value, "dtype") and _is_non_decimal_numeric_dtype(value.dtype) ): # normalize types if necessary: - if not is_integer(key): - to_dtype = np.result_type( - value.dtype, self._frame._column.dtype - ) - value = value.astype(to_dtype) + # In contrast to Column.__setitem__ (which downcasts the value to + # the dtype of the column) here we upcast the series to the + # larger data type mimicking pandas + to_dtype = np.result_type(value.dtype, self._frame._column.dtype) + value = value.astype(to_dtype) + if to_dtype != self._frame._column.dtype: self._frame._column._mimic_inplace( self._frame._column.astype(to_dtype), inplace=True ) @@ -283,6 +285,10 @@ def __setitem__(self, key, value): self._frame.iloc[key] = value def _loc_to_iloc(self, arg): + if isinstance(arg, tuple) and arg and isinstance(arg[0], slice): + if len(arg) > 1: + raise IndexError("Too many Indexers") + arg = arg[0] if _is_scalar_or_zero_d_array(arg): if not 
_is_non_decimal_numeric_dtype(self._frame.index.dtype): # TODO: switch to cudf.utils.dtypes.is_integer(arg) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 1fcfbe5fc91..58bee95326f 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -289,6 +289,62 @@ def test_axes(data): assert_eq(e, a) +def test_dataframe_truncate_axis_0(): + df = cudf.DataFrame( + { + "A": ["a", "b", "c", "d", "e"], + "B": ["f", "g", "h", "i", "j"], + "C": ["k", "l", "m", "n", "o"], + }, + index=[1, 2, 3, 4, 5], + ) + pdf = df.to_pandas() + + expected = pdf.truncate(before=2, after=4, axis="index") + actual = df.truncate(before=2, after=4, axis="index") + assert_eq(actual, expected) + + expected = pdf.truncate(before=1, after=4, axis=0) + actual = df.truncate(before=1, after=4, axis=0) + assert_eq(expected, actual) + + +def test_dataframe_truncate_axis_1(): + df = cudf.DataFrame( + { + "A": ["a", "b", "c", "d", "e"], + "B": ["f", "g", "h", "i", "j"], + "C": ["k", "l", "m", "n", "o"], + }, + index=[1, 2, 3, 4, 5], + ) + pdf = df.to_pandas() + + expected = pdf.truncate(before="A", after="B", axis="columns") + actual = df.truncate(before="A", after="B", axis="columns") + assert_eq(actual, expected) + + expected = pdf.truncate(before="A", after="B", axis=1) + actual = df.truncate(before="A", after="B", axis=1) + assert_eq(actual, expected) + + +def test_dataframe_truncate_datetimeindex(): + dates = cudf.date_range( + "2021-01-01 23:45:00", "2021-01-01 23:46:00", freq="s" + ) + df = cudf.DataFrame(data={"A": 1, "B": 2}, index=dates) + pdf = df.to_pandas() + expected = pdf.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ) + actual = df.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ) + + assert_eq(actual, expected) + + def test_series_init_none(): # test for creating empty series diff --git a/python/cudf/cudf/tests/test_decimal.py 
b/python/cudf/cudf/tests/test_decimal.py index c37381a3af9..c7174adf342 100644 --- a/python/cudf/cudf/tests/test_decimal.py +++ b/python/cudf/cudf/tests/test_decimal.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. import decimal from decimal import Decimal @@ -377,3 +377,9 @@ def test_decimal_invalid_precision(): with pytest.raises(pa.ArrowInvalid): _ = cudf.Series([Decimal("300")], dtype=cudf.Decimal64Dtype(2, 1)) + + +def test_decimal_overflow(): + s = cudf.Series([Decimal("0.0009384233522166997927180531650178250")]) + result = s * s + assert_eq(cudf.Decimal128Dtype(precision=38, scale=37), result.dtype) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 00d6e0b2899..34aff2c34fe 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -897,3 +897,49 @@ def test_json_dtypes_nested_data(): pdf, schema=df.to_arrow().schema, safe=False ) assert df.to_arrow().equals(pa_table_pdf) + + +@pytest.mark.parametrize( + "tag, data", + [ + ( + "normal", + """\ +{"a": 1, "b": 2} +{"a": 3, "b": 4}""", + ), + ( + "multiple", + """\ + { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } + { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } + { "a": { "y" : 6}, "b" : [6 ], "c": 13 } + { "a": { "y" : 6}, "b" : [7 ], "c": 14 }""", + ), + ( + "reordered", + """\ + { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } + { "a": { "y" : 6}, "c": 12 , "b" : [4, 5 ]} + { "b" : [6 ], "a": { "y" : 6}, "c": 13} + { "c" : 14, "a": { "y" : 6}, "b" : [7 ]} +""", + ), + ( + "missing", + """ + { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 } + { "a": { "y" : 6}, "b" : [4, 5 ]} + { "a": { "y" : 6}, "c": 13 } + { "a": { "y" : 6}, "b" : [7 ], "c": 14 } + """, + ), + ], +) +def test_order_nested_json_reader(tag, data): + expected = cudf.read_json(StringIO(data), engine="pandas", lines=True) + target = cudf.read_json( + StringIO(data), engine="cudf_experimental", lines=True + ) + + 
assert_eq(expected, target, check_dtype=True) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index c0b99f56238..d5af2899bb0 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1614,6 +1614,47 @@ def test_axes(data): assert_eq(e, a) +def test_series_truncate(): + csr = cudf.Series([1, 2, 3, 4]) + psr = csr.to_pandas() + + assert_eq(csr.truncate(), psr.truncate()) + assert_eq(csr.truncate(1, 2), psr.truncate(1, 2)) + assert_eq(csr.truncate(before=1, after=2), psr.truncate(before=1, after=2)) + + +def test_series_truncate_errors(): + csr = cudf.Series([1, 2, 3, 4]) + with pytest.raises(ValueError): + csr.truncate(axis=1) + with pytest.raises(ValueError): + csr.truncate(copy=False) + + csr.index = [3, 2, 1, 6] + psr = csr.to_pandas() + assert_exceptions_equal( + lfunc=csr.truncate, + rfunc=psr.truncate, + ) + + +def test_series_truncate_datetimeindex(): + dates = cudf.date_range( + "2021-01-01 23:45:00", "2021-01-02 23:46:00", freq="s" + ) + csr = cudf.Series(range(len(dates)), index=dates) + psr = csr.to_pandas() + + assert_eq( + csr.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ), + psr.truncate( + before="2021-01-01 23:45:18", after="2021-01-01 23:45:27" + ), + ) + + @pytest.mark.parametrize( "data", [ diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py index 13b342e6c3b..ac9dbecda65 100644 --- a/python/cudf/cudf/tests/test_setitem.py +++ b/python/cudf/cudf/tests/test_setitem.py @@ -297,3 +297,48 @@ def test_series_slice_setitem_struct(): actual[0:3] = cudf.Scalar({"a": {"b": 5050}, "b": 101}) assert_eq(actual, expected) + + +@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) +@pytest.mark.parametrize("indices", [0, [1, 2]]) +def test_series_setitem_upcasting(dtype, indices): + sr = pd.Series([0, 0, 0], dtype=dtype) + cr = cudf.from_pandas(sr) + assert_eq(sr, cr) + # Must be a 
non-integral floating point value that can't be losslessly + # converted to float32, otherwise pandas will try and match the source + # column dtype. + new_value = np.float64(np.pi) + col_ref = cr._column + sr[indices] = new_value + cr[indices] = new_value + if PANDAS_GE_150: + assert_eq(sr, cr) + else: + # pandas bug, incorrectly fails to upcast from float32 to float64 + assert_eq(sr.values, cr.values) + if dtype == np.float64: + # no-op type cast should not modify backing column + assert col_ref == cr._column + + +# TODO: these two tests could perhaps be changed once specifics of +# pandas compat wrt upcasting are decided on; this is just baking in +# status-quo. +def test_series_setitem_upcasting_string_column(): + sr = pd.Series([0, 0, 0], dtype=str) + cr = cudf.from_pandas(sr) + new_value = np.float64(10.5) + sr[0] = str(new_value) + cr[0] = new_value + assert_eq(sr, cr) + + +def test_series_setitem_upcasting_string_value(): + sr = cudf.Series([0, 0, 0], dtype=int) + # This is a distinction with pandas, which lets you instead make an + # object column with ["10", 0, 0] + sr[0] = "10" + assert_eq(pd.Series([10, 0, 0], dtype=int), sr) + with pytest.raises(ValueError): + sr[0] = "non-integer" diff --git a/python/strings_udf/cpp/CMakeLists.txt b/python/strings_udf/cpp/CMakeLists.txt index e5b4aca7076..3e58d10d6e2 100644 --- a/python/strings_udf/cpp/CMakeLists.txt +++ b/python/strings_udf/cpp/CMakeLists.txt @@ -92,6 +92,10 @@ endfunction() # Create the shim library for each architecture. 
set(SHIM_CUDA_FLAGS --expt-relaxed-constexpr -rdc=true) +# always build a default PTX file in case RAPIDS_NO_INITIALIZE is set and the device cc can't be +# safely queried through a context +list(INSERT CMAKE_CUDA_ARCHITECTURES 0 "60") + list(TRANSFORM CMAKE_CUDA_ARCHITECTURES REPLACE "-real" "") list(TRANSFORM CMAKE_CUDA_ARCHITECTURES REPLACE "-virtual" "") list(SORT CMAKE_CUDA_ARCHITECTURES) diff --git a/python/strings_udf/cpp/src/strings/udf/udf_apis.cu b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu index 7927740fd49..b4d5014d9e0 100644 --- a/python/strings_udf/cpp/src/strings/udf/udf_apis.cu +++ b/python/strings_udf/cpp/src/strings/udf/udf_apis.cu @@ -58,7 +58,7 @@ std::unique_ptr to_string_view_array(cudf::column_view const { return std::make_unique( std::move(cudf::strings::detail::create_string_vector_from_column( - cudf::strings_column_view(input), stream) + cudf::strings_column_view(input), stream, rmm::mr::get_current_device_resource()) .release())); } diff --git a/python/strings_udf/strings_udf/__init__.py b/python/strings_udf/strings_udf/__init__.py index 24f1a2d3bda..2222fb72009 100644 --- a/python/strings_udf/strings_udf/__init__.py +++ b/python/strings_udf/strings_udf/__init__.py @@ -43,7 +43,8 @@ def maybe_patch_numba_linker(driver_version): def _get_ptx_file(): if "RAPIDS_NO_INITIALIZE" in os.environ: - cc = int(os.environ.get("STRINGS_UDF_CC", "52")) + # shim_60.ptx is always built + cc = int(os.environ.get("STRINGS_UDF_CC", "60")) else: dev = cuda.get_current_device()