diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index a6167d983c5..96322159f0f 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -410,63 +410,6 @@ std::unique_ptr<column> make_strings_column( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Construct a STRING type column given a device span of chars encoded as UTF-8, a device - * span of byte offsets identifying individual strings within the char vector, and an optional - * null bitmask. - * - * @deprecated Since 24.02 - * - * `offsets.front()` must always be zero. - * - * The total number of char bytes must not exceed the maximum size of size_type. Use the - * strings_column_view class to perform strings operations on this type of column. - * - * This function makes a deep copy of the strings, offsets, null_mask to create a new column. - * - * @param strings The device span of chars in device memory. This char vector is expected to be - * UTF-8 encoded characters. - * @param offsets The device span of byte offsets in device memory. The number of elements is - * one more than the total number of strings so the `offsets.back()` is the total number of bytes - * in the strings array. `offsets.front()` must always be 0 to point to the beginning of `strings`. - * @param null_mask Device span containing the null element indicator bitmask. Arrow format for - * nulls is used for interpreting this bitmask. 
- * @param null_count The number of null string entries - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used for allocation of the column's `null_mask` and children - * columns' device memory - * @return Constructed strings column - */ -[[deprecated]] std::unique_ptr<column> make_strings_column( - cudf::device_span<char const> strings, - cudf::device_span<size_type const> offsets, - cudf::device_span<bitmask_type const> null_mask, - size_type null_count, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Construct a STRING type column given offsets column, chars columns, and null mask and null - * count. - * - * The columns and mask are moved into the resulting strings column. - * - * @param num_strings The number of strings the column represents. - * @param offsets_column The column of offset values for this column. The number of elements is - * one more than the total number of strings so the `offset[last] - offset[0]` is the total number - * of bytes in the strings vector. - * @param chars_column The column of char bytes for all the strings for this column. Individual - * strings are identified by the offsets and the nullmask. - * @param null_count The number of null string entries. - * @param null_mask The bits specifying the null strings in device memory. Arrow format for - * nulls is used for interpreting this bitmask. - * @return Constructed strings column - */ -[[deprecated]] std::unique_ptr<column> make_strings_column(size_type num_strings, - std::unique_ptr<column> offsets_column, - std::unique_ptr<column> chars_column, - size_type null_count, - rmm::device_buffer&& null_mask); /** * @brief Construct a STRING type column given offsets column, chars columns, and null mask and null * count. 
@@ -490,29 +433,6 @@ std::unique_ptr<column> make_strings_column(size_type num_strings, size_type null_count, rmm::device_buffer&& null_mask); -/** - * @brief Construct a STRING type column given offsets, columns, and optional null count and null - * mask. - * - * @deprecated Since 24.02 - * - * @param[in] num_strings The number of strings the column represents. - * @param[in] offsets The offset values for this column. The number of elements is one more than the - * total number of strings so the `offset[last] - offset[0]` is the total number of bytes in the - * strings vector. - * @param[in] chars The char bytes for all the strings for this column. Individual strings are - * identified by the offsets and the nullmask. - * @param[in] null_mask The bits specifying the null strings in device memory. Arrow format for - * nulls is used for interpreting this bitmask. - * @param[in] null_count The number of null string entries. - * @return Constructed strings column - */ -[[deprecated]] std::unique_ptr<column> make_strings_column(size_type num_strings, - rmm::device_uvector<size_type>&& offsets, - rmm::device_uvector<char>&& chars, - rmm::device_buffer&& null_mask, - size_type null_count); - /** * @brief Construct a LIST type column given offsets column, child column, null mask and null * count. 
diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp index e6546777f3f..840a2dd1165 100644 --- a/cpp/include/cudf/strings/strings_column_view.hpp +++ b/cpp/include/cudf/strings/strings_column_view.hpp @@ -103,16 +103,6 @@ class strings_column_view : private column_view { */ [[nodiscard]] offset_iterator offsets_end() const; - /** - * @brief Returns the internal column of chars - * - * @throw cudf::logic_error if this is an empty column - * @param stream CUDA stream used for device memory operations and kernel launches - * @return The chars column - */ - [[deprecated]] [[nodiscard]] column_view chars( - rmm::cuda_stream_view stream = cudf::get_default_stream()) const; - /** * @brief Returns the number of bytes in the chars child column. * diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index a9ddcfa12a2..b96c799cf4d 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -16,9 +16,9 @@ #include <cudf/column/column_device_view.cuh> #include <cudf/column/column_factories.hpp> -#include <cudf/detail/get_value.cuh> #include <cudf/detail/null_mask.hpp> #include <cudf/detail/nvtx/ranges.hpp> +#include <cudf/detail/offsets_iterator_factory.cuh> #include <cudf/detail/utilities/cuda.cuh> #include <cudf/detail/utilities/integer_utils.hpp> #include <cudf/strings/convert/convert_urls.hpp> @@ -34,10 +34,6 @@ #include <cub/cub.cuh> -#include <thrust/scan.h> - -#include <algorithm> - namespace cudf { namespace strings { namespace detail { @@ -282,7 +278,7 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings, template <size_type num_warps_per_threadblock, size_type char_block_size> CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings, char* const out_chars, - size_type const* const out_offsets) + cudf::detail::input_offsetalator const out_offsets) { constexpr int halo_size = 2; 
__shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size * 2]; @@ -384,38 +380,25 @@ std::unique_ptr<column> url_decode(strings_column_view const& strings, auto const num_threadblocks = std::min(65536, cudf::util::div_rounding_up_unsafe(strings_count, num_warps_per_threadblock)); - auto offset_count = strings_count + 1; auto const d_strings = column_device_view::create(strings.parent(), stream); - // build offsets column - auto offsets_column = make_numeric_column( - data_type{type_to_id<size_type>()}, offset_count, mask_state::UNALLOCATED, stream, mr); - - // count number of bytes in each string after decoding and store it in offsets_column - auto offsets_view = offsets_column->view(); - auto offsets_mutable_view = offsets_column->mutable_view(); + // build offsets column by computing the output row sizes and scanning the results + auto row_sizes = rmm::device_uvector<size_type>(strings_count, stream); url_decode_char_counter<num_warps_per_threadblock, char_block_size> - <<<num_threadblocks, threadblock_size, 0, stream.value()>>>( - *d_strings, offsets_mutable_view.begin<size_type>()); - - // use scan to transform number of bytes into offsets - thrust::exclusive_scan(rmm::exec_policy(stream), - offsets_view.begin<size_type>(), - offsets_view.end<size_type>(), - offsets_mutable_view.begin<size_type>()); - - // copy the total number of characters of all strings combined (last element of the offset column) - // to the host memory - auto out_chars_bytes = cudf::detail::get_value<size_type>(offsets_view, offset_count - 1, stream); + <<<num_threadblocks, threadblock_size, 0, stream.value()>>>(*d_strings, row_sizes.data()); + // performs scan on the sizes and builds the appropriate offsets column + auto [offsets_column, out_chars_bytes] = cudf::strings::detail::make_offsets_child_column( + row_sizes.begin(), row_sizes.end(), stream, mr); // create the chars column rmm::device_uvector<char> chars(out_chars_bytes, stream, mr); auto 
d_out_chars = chars.data(); + auto const offsets = + cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view()); // decode and copy the characters from the input column to the output column url_decode_char_replacer<num_warps_per_threadblock, char_block_size> - <<<num_threadblocks, threadblock_size, 0, stream.value()>>>( - *d_strings, d_out_chars, offsets_column->view().begin<size_type>()); + <<<num_threadblocks, threadblock_size, 0, stream.value()>>>(*d_strings, d_out_chars, offsets); // copy null mask rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu index 5ba4d8d3132..0f1b9e3baae 100644 --- a/cpp/src/strings/strings_column_factories.cu +++ b/cpp/src/strings/strings_column_factories.cu @@ -56,25 +56,6 @@ std::unique_ptr<column> make_strings_column( return cudf::strings::detail::make_strings_column(strings.begin(), strings.end(), stream, mr); } -std::unique_ptr<column> make_strings_column(device_span<char> chars, - device_span<size_type> offsets, - size_type null_count, - rmm::device_buffer&& null_mask, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - - return cudf::strings::detail::make_strings_column(chars.begin(), - chars.end(), - offsets.begin(), - offsets.end(), - null_count, - std::move(null_mask), - stream, - mr); -} - std::unique_ptr<column> make_strings_column(device_span<string_view const> string_views, string_view null_placeholder, rmm::cuda_stream_view stream, @@ -88,57 +69,6 @@ std::unique_ptr<column> make_strings_column(device_span<string_view const> strin it_pair, it_pair + string_views.size(), stream, mr); } -// Create a strings-type column from device vector of chars and vector of offsets. 
-std::unique_ptr<column> make_strings_column(cudf::device_span<char const> strings, - cudf::device_span<size_type const> offsets, - cudf::device_span<bitmask_type const> valid_mask, - size_type null_count, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - - // build null bitmask - rmm::device_buffer null_mask{ - valid_mask.data(), valid_mask.size() * sizeof(bitmask_type), stream, mr}; - - return cudf::strings::detail::make_strings_column(strings.begin(), - strings.end(), - offsets.begin(), - offsets.end(), - null_count, - std::move(null_mask), - stream, - mr); -} - -// -std::unique_ptr<column> make_strings_column(size_type num_strings, - std::unique_ptr<column> offsets_column, - std::unique_ptr<column> chars_column, - size_type null_count, - rmm::device_buffer&& null_mask) -{ - CUDF_FUNC_RANGE(); - - if (num_strings == 0) { return make_empty_column(type_id::STRING); } - - if (null_count > 0) CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); - CUDF_EXPECTS(num_strings == offsets_column->size() - 1, - "Invalid offsets column size for strings column."); - CUDF_EXPECTS(offsets_column->null_count() == 0, "Offsets column should not contain nulls"); - CUDF_EXPECTS(chars_column->null_count() == 0, "Chars column should not contain nulls"); - - std::vector<std::unique_ptr<column>> children; - children.emplace_back(std::move(offsets_column)); - return std::make_unique<column>(data_type{type_id::STRING}, - num_strings, - std::move(*(chars_column->release().data.release())), - std::move(null_mask), - null_count, - std::move(children)); -} - std::unique_ptr<column> make_strings_column(size_type num_strings, std::unique_ptr<column> offsets_column, rmm::device_buffer&& chars_buffer, diff --git a/cpp/src/strings/strings_column_view.cpp b/cpp/src/strings/strings_column_view.cpp index 27a8c6fb17f..6be22d8e729 100644 --- a/cpp/src/strings/strings_column_view.cpp +++ b/cpp/src/strings/strings_column_view.cpp @@ 
-45,13 +45,6 @@ strings_column_view::offset_iterator strings_column_view::offsets_end() const
   return offsets_begin() + size() + 1;
 }
 
-column_view strings_column_view::chars(rmm::cuda_stream_view stream) const
-{
-  CUDF_EXPECTS(num_children() > 0, "strings column has no children");
-  return column_view(
-    data_type{type_id::INT8}, chars_size(stream), chars_begin(stream), nullptr, 0, 0);
-}
-
 size_type strings_column_view::chars_size(rmm::cuda_stream_view stream) const noexcept
 {
   if (size() == 0) return 0;
diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py
index 7e66a7ab4ba..f18cb32a091 100644
--- a/python/cudf/cudf/tests/test_spilling.py
+++ b/python/cudf/cudf/tests/test_spilling.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
+import contextlib
 import importlib
 import random
 import time
@@ -215,45 +216,76 @@ def test_spilling_buffer(manager: SpillManager):
         buf.spill(target="cpu")
 
 
-def test_environment_variables(monkeypatch):
-    def reload_options():
-        # In order to enabling monkey patching of the environment variables
-        # mark the global manager as uninitialized.
-        set_global_manager(None)
-        cudf.core.buffer.spill_manager._global_manager_uninitialized = True
-        importlib.reload(cudf.options)
-
-    monkeypatch.setenv("CUDF_SPILL_ON_DEMAND", "off")
-    monkeypatch.setenv("CUDF_SPILL", "off")
-    reload_options()
-    assert get_global_manager() is None
-
-    monkeypatch.setenv("CUDF_SPILL", "on")
-    reload_options()
-    manager = get_global_manager()
-    assert isinstance(manager, SpillManager)
-    assert manager._spill_on_demand is False
-    assert manager._device_memory_limit is None
-    assert manager.statistics.level == 0
-
-    monkeypatch.setenv("CUDF_SPILL_DEVICE_LIMIT", "1000")
-    reload_options()
-    manager = get_global_manager()
-    assert isinstance(manager, SpillManager)
-    assert manager._device_memory_limit == 1000
-    assert manager.statistics.level == 0
-
-    monkeypatch.setenv("CUDF_SPILL_STATS", "1")
-    reload_options()
-    manager = get_global_manager()
-    assert isinstance(manager, SpillManager)
-    assert manager.statistics.level == 1
-
-    monkeypatch.setenv("CUDF_SPILL_STATS", "2")
-    reload_options()
-    manager = get_global_manager()
-    assert isinstance(manager, SpillManager)
-    assert manager.statistics.level == 2
+def _reload_options():
+    # To enable monkeypatching of the environment variables,
+    # mark the global manager as uninitialized.
+    set_global_manager(None)
+    cudf.core.buffer.spill_manager._global_manager_uninitialized = True
+    importlib.reload(cudf.options)
+
+
+@contextlib.contextmanager
+def _get_manager_in_env(monkeypatch, var_vals):
+    """Yield ``get_global_manager()`` as configured by temporary env vars.
+
+    ``var_vals`` is an iterable of ``(name, value)`` pairs; each one is set
+    through ``monkeypatch`` before the options are reloaded.
+    """
+    try:
+        with monkeypatch.context() as m:
+            for var, val in var_vals:
+                m.setenv(var, val)
+            _reload_options()
+            yield get_global_manager()
+    finally:
+        # Runs after the environment has been restored, and also when the
+        # caller's ``with`` body raises (e.g. a failing assert), so the
+        # patched configuration cannot leak into later tests.
+        _reload_options()
+
+
+def test_environment_variables_spill_off(monkeypatch):
+    with _get_manager_in_env(
+        monkeypatch,
+        [("CUDF_SPILL", "off"), ("CUDF_SPILL_ON_DEMAND", "off")],
+    ) as manager:
+        assert manager is None
+
+
+def test_environment_variables_spill_on(monkeypatch):
+    with _get_manager_in_env(
+        monkeypatch,
+        [("CUDF_SPILL", "on")],
+    ) as manager:
+        assert isinstance(manager, SpillManager)
+        assert manager._spill_on_demand is True
+        assert manager._device_memory_limit is None
+        assert manager.statistics.level == 0
+
+
+def test_environment_variables_device_limit(monkeypatch):
+    with _get_manager_in_env(
+        monkeypatch,
+        [("CUDF_SPILL", "on"), ("CUDF_SPILL_DEVICE_LIMIT", "1000")],
+    ) as manager:
+        assert isinstance(manager, SpillManager)
+        assert manager._device_memory_limit == 1000
+        assert manager.statistics.level == 0
+
+
+@pytest.mark.parametrize("level", (1, 2))
+def test_environment_variables_spill_stats(monkeypatch, level):
+    with _get_manager_in_env(
+        monkeypatch,
+        [
+            ("CUDF_SPILL", "on"),
+            ("CUDF_SPILL_DEVICE_LIMIT", "1000"),
+            ("CUDF_SPILL_STATS", f"{level}"),
+        ],
+    ) as manager:
+        assert isinstance(manager, SpillManager)
+        assert manager._device_memory_limit == 1000
+        assert manager.statistics.level == level
 
 
 def test_spill_device_memory(manager: SpillManager):
@@ -507,6 +539,10 @@ def test_serialize_cuda_dataframe(manager: SpillManager):
     assert_eq(df1, df2)
 
 
+@pytest.mark.skip(
+    reason="This test is not safe because other tests may have enabled "
+    "spilling and already modified rmm's global state"
+)
 def test_get_rmm_memory_resource_stack():
     mr1 = rmm.mr.get_current_device_resource()
     assert all(