diff --git a/README.md b/README.md index 0602647dad7..a64e39452ec 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,10 @@ print(tips_df.groupby("size").tip_percentage.mean()) - [libcudf (C++/CUDA) documentation](https://docs.rapids.ai/api/libcudf/stable/) - [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate. +See the [RAPIDS install page](https://docs.rapids.ai/install) for +the most up-to-date information and commands for installing cuDF +and other RAPIDS packages. + ## Installation ### CUDA/GPU requirements @@ -64,6 +68,24 @@ print(tips_df.groupby("size").tip_percentage.mean()) * NVIDIA driver 450.80.02+ * Volta architecture or better (Compute Capability >=7.0) +### Pip + +cuDF can be installed via `pip` from the NVIDIA Python Package Index. +Be sure to select the appropriate cuDF package depending +on the major version of CUDA available in your environment: + +For CUDA 11.x: + +```bash +pip install --extra-index-url=https://pypi.nvidia.com cudf-cu11 +``` + +For CUDA 12.x: + +```bash +pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12 +``` + ### Conda cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects/miniconda/en/latest/) or the full [Anaconda distribution](https://www.anaconda.com/download) from the `rapidsai` channel: diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 98b17bc0a64..5a4bf3e0dbc 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -40,8 +40,8 @@ popd rapids-logger "Build Python docs" pushd docs/cudf -make dirhtml O="-j 4" -make text O="-j 4" +make dirhtml +make text mkdir -p "${RAPIDS_DOCS_DIR}/cudf/"{html,txt} mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/cudf/html" mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt" diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 9c674518810..c4b794e81f7 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -48,7 +48,7 @@ fi if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file} sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file} - sed -i "/ptxcompiler/d" ${pyproject_file} + sed -i "s/ptxcompiler/pynvjitlink/g" ${pyproject_file} sed -i "/cubinlinker/d" ${pyproject_file} fi diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index b5e2566fd0d..8081d9de8b9 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -75,7 +75,7 @@ dependencies: - pydata-sphinx-theme!=0.14.2 - pytest - pytest-benchmark -- pytest-cases<3.8.2 +- pytest-cases>=3.8.2 - pytest-cov - pytest-xdist - python-confluent-kafka>=1.9.0,<1.10.0a0 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 2aa64e6384b..1cb8f376f82 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -70,9 +70,10 @@ dependencies: - protobuf>=4.21,<5 - pyarrow==14.0.1.* - pydata-sphinx-theme!=0.14.2 +- pynvjitlink - pytest - pytest-benchmark -- pytest-cases<3.8.2 +- pytest-cases>=3.8.2 - pytest-cov - pytest-xdist - python-confluent-kafka>=1.9.0,<1.10.0a0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index bc91ee61f6f..4f39a9fe452 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -98,6 +98,7 @@ requirements: # xref: https://github.com/rapidsai/cudf/issues/12822 - cuda-nvrtc - cuda-python >=12.0,<13.0a0 + - pynvjitlink {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - nvtx >=0.2.1 diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index bb7529bb37a..0ea13957868 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -540,7 +540,7 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons return cudf::make_strings_column( num_rows, std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), - std::make_unique(std::move(chars), rmm::device_buffer{}, 0), + chars.release(), null_count, profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}); } diff --git a/cpp/benchmarks/json/json.cu b/cpp/benchmarks/json/json.cu index c74701445f8..020c8e413b3 100644 --- a/cpp/benchmarks/json/json.cu +++ b/cpp/benchmarks/json/json.cu @@ -177,10 +177,10 @@ auto build_json_string_column(int desired_bytes, int num_rows) auto d_store_order = cudf::column_device_view::create(float_2bool_columns->get_column(2)); json_benchmark_row_builder jb{ desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order}; - auto children = cudf::strings::detail::make_strings_children( + auto [offsets, chars] = cudf::strings::detail::make_strings_children( jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); return cudf::make_strings_column( - num_rows, std::move(children.first), std::move(children.second), 0, {}); + num_rows, std::move(offsets), std::move(chars->release().data.release()[0]), 0, {}); } void BM_case(benchmark::State& state, std::string query_arg) diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 6f0b199ff12..64e14dcc549 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -109,7 +109,7 @@ std::unique_ptr copy_if_else(StringIterLeft lhs_begin, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index 5da3addd9a4..567452bac4e 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -205,7 +205,7 @@ std::unique_ptr copy_range(SourceValueIterator source_value_begin, return make_strings_column(target.size(), std::move(p_offsets_column), - std::move(p_chars_column), + std::move(p_chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index f7d2ebebe9a..442155380a2 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -321,7 +321,7 @@ std::unique_ptr gather(strings_column_view const& strings, return make_strings_column(output_count, std::move(out_offsets_column), - std::move(out_chars_column), + std::move(out_chars_column->release().data.release()[0]), 0, // caller sets these rmm::device_buffer{}); } diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh index aef1fe93792..8049895c3c2 100644 --- a/cpp/include/cudf/strings/detail/merge.cuh +++ b/cpp/include/cudf/strings/detail/merge.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -89,9 +89,8 @@ std::unique_ptr merge(strings_column_view const& lhs, auto d_offsets = offsets_column->view().template data(); // create the chars column - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - // merge the strings - auto d_chars = chars_column->mutable_view().template data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, @@ -103,11 +102,8 @@ std::unique_ptr merge(strings_column_view const& lhs, memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes()); }); - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), - null_count, - std::move(null_mask)); + return make_strings_column( + strings_count, std::move(offsets_column), chars.release(), null_count, std::move(null_mask)); } } // namespace detail diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index de7db4ce47b..fcbdfa619f4 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -137,7 +137,7 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } @@ -187,13 +187,12 @@ std::unique_ptr make_strings_column(CharIterator chars_begin, [] __device__(auto offset) { return static_cast(offset); })); // build chars column - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - thrust::copy(rmm::exec_policy(stream), chars_begin, chars_end, chars_view.data()); + rmm::device_uvector chars_data(bytes, stream, mr); + thrust::copy(rmm::exec_policy(stream), chars_begin, chars_end, chars_data.begin()); return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + chars_data.release(), null_count, std::move(null_mask)); } diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index abcd89c3035..c4fa4be0f89 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -757,20 +757,21 @@ class strings_column_wrapper : public detail::column_wrapper { strings_column_wrapper(StringsIterator begin, StringsIterator end) : column_wrapper{} { size_type num_strings = std::distance(begin, end); + if (num_strings == 0) { + wrapped = cudf::make_empty_column(cudf::type_id::STRING); + return; + } auto all_valid = thrust::make_constant_iterator(true); auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid); - auto d_chars = std::make_unique( - cudf::detail::make_device_uvector_sync( - chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_chars = cudf::detail::make_device_uvector_async( + chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( cudf::detail::make_device_uvector_sync( offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0); wrapped = - cudf::make_strings_column(num_strings, std::move(d_offsets), std::move(d_chars), 0, {}); + cudf::make_strings_column(num_strings, std::move(d_offsets), d_chars.release(), 0, {}); } /** @@ -805,23 +806,24 @@ class strings_column_wrapper : public detail::column_wrapper { strings_column_wrapper(StringsIterator begin, StringsIterator end, ValidityIterator v) : column_wrapper{} { - size_type num_strings = std::distance(begin, end); + size_type num_strings = std::distance(begin, end); + if (num_strings == 0) { + wrapped = cudf::make_empty_column(cudf::type_id::STRING); + return; + } auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v); auto [null_mask, null_count] = detail::make_null_mask_vector(v, v + num_strings); - auto d_chars = std::make_unique( - cudf::detail::make_device_uvector_sync( - chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_chars = cudf::detail::make_device_uvector_async( + chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( - cudf::detail::make_device_uvector_sync( + cudf::detail::make_device_uvector_async( offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0); auto d_bitmask = cudf::detail::make_device_uvector_sync( null_mask, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); wrapped = cudf::make_strings_column( - num_strings, std::move(d_offsets), std::move(d_chars), null_count, d_bitmask.release()); + num_strings, std::move(d_offsets), d_chars.release(), null_count, d_bitmask.release()); } /** diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu index 8fc3e63bc59..002c9a9137b 100644 --- a/cpp/src/hash/md5_hash.cu +++ b/cpp/src/hash/md5_hash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -333,9 +333,8 @@ std::unique_ptr md5(table_view const& input, auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr); - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); auto const device_input = table_device_view::create(input, stream); @@ -366,8 +365,7 @@ std::unique_ptr md5(table_view const& input, } }); - return make_strings_column( - input.num_rows(), std::move(offsets_column), std::move(chars_column), 0, {}); + return make_strings_column(input.num_rows(), std::move(offsets_column), chars.release(), 0, {}); } } // namespace detail diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index e39625c92e7..7b44fb41288 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -290,7 +290,7 @@ std::unique_ptr dispatch_to_cudf_column::operator()( auto const num_rows = offsets_column->size() - 1; auto out_col = make_strings_column(num_rows, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), array.null_count(), std::move(*get_mask_buffer(array, stream, mr))); diff --git a/cpp/src/io/csv/durations.cu b/cpp/src/io/csv/durations.cu index 66143d3fdee..f4d32edac89 100644 --- a/cpp/src/io/csv/durations.cu +++ b/cpp/src/io/csv/durations.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -202,7 +202,7 @@ struct dispatch_from_durations_fn { // return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), durations.null_count(), std::move(null_mask)); } diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 995d8d942c9..65473073e31 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -181,11 +181,12 @@ struct column_to_strings_fn { auto d_column = column_device_view::create(column_v, stream_); escape_strings_fn fn{*d_column, delimiter.value(stream_)}; - auto children = cudf::strings::detail::make_strings_children(fn, column_v.size(), stream_, mr_); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(fn, column_v.size(), stream_, mr_); return make_strings_column(column_v.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), column_v.null_count(), cudf::detail::copy_bitmask(column_v, stream_, mr_)); } diff --git a/cpp/src/io/json/legacy/reader_impl.cu b/cpp/src/io/json/legacy/reader_impl.cu index 5580628b0fe..d461f27c921 100644 --- a/cpp/src/io/json/legacy/reader_impl.cu +++ b/cpp/src/io/json/legacy/reader_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -530,29 +530,27 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, auto repl_chars = std::vector{'"', '\\', '\t', '\r', '\b'}; auto repl_offsets = std::vector{0, 1, 2, 3, 4, 5}; - auto target = make_strings_column( - static_cast(target_offsets.size() - 1), - std::make_unique( - cudf::detail::make_device_uvector_async( - target_offsets, stream, rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), - std::make_unique(cudf::detail::make_device_uvector_async( - target_chars, stream, rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), - 0, - {}); + auto target = + make_strings_column(static_cast(target_offsets.size() - 1), + std::make_unique( + cudf::detail::make_device_uvector_async( + target_offsets, stream, rmm::mr::get_current_device_resource()), + rmm::device_buffer{}, + 0), + cudf::detail::make_device_uvector_async( + target_chars, stream, rmm::mr::get_current_device_resource()) + .release(), + 0, + {}); auto repl = make_strings_column( static_cast(repl_offsets.size() - 1), std::make_unique(cudf::detail::make_device_uvector_async( repl_offsets, stream, rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0), - std::make_unique(cudf::detail::make_device_uvector_async( - repl_chars, stream, rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), + cudf::detail::make_device_uvector_async( + repl_chars, stream, rmm::mr::get_current_device_resource()) + .release(), 0, {}); diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index c35f15049bd..84e0ac9e74d 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -170,12 +170,12 @@ struct escape_strings_fn { rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto children = + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr); return make_strings_column(column_v.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), column_v.null_count(), cudf::detail::copy_bitmask(column_v, stream, mr)); } @@ -347,13 +347,11 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, d_strview_offsets + row_string_offsets.size(), old_offsets.begin(), row_string_offsets.begin()); - auto chars_data = joined_col->release().data; - auto const chars_size = chars_data->size(); + auto chars_data = joined_col->release().data; return make_strings_column( strings_columns.num_rows(), std::make_unique(std::move(row_string_offsets), rmm::device_buffer{}, 0), - std::make_unique( - data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0), + std::move(chars_data.release()[0]), 0, {}); } @@ -472,13 +470,11 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri d_strview_offsets.end(), old_offsets.begin(), row_string_offsets.begin()); - auto chars_data = joined_col->release().data; - auto const chars_size = chars_data->size(); + auto chars_data = joined_col->release().data; return make_strings_column( num_lists, std::make_unique(std::move(row_string_offsets), rmm::device_buffer{}, 0), - std::make_unique( - data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0), + std::move(chars_data.release()[0]), lists_strings.null_count(), cudf::detail::copy_bitmask(lists_strings.parent(), stream, mr)); } @@ -780,11 +776,7 @@ std::unique_ptr make_strings_column_from_host(host_span(std::move(d_chars), rmm::device_buffer{}, 0), - 0, - {}); + host_strings.size(), std::move(d_offsets), d_chars.release(), 0, {}); } std::unique_ptr make_column_names_column(host_span column_names, diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 9c8b03886b5..f43a8fd24c4 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -189,7 +189,7 @@ struct stats_caster { return cudf::make_strings_column( val.size(), std::make_unique(std::move(d_offsets), rmm::device_buffer{}, 0), - std::make_unique(std::move(d_chars), rmm::device_buffer{}, 0), + d_chars.release(), null_count, rmm::device_buffer{ null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 2194ee1aaa1..34a476974e4 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -556,7 +556,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source return cudf::make_strings_column( string_count, std::make_unique(std::move(offsets), rmm::device_buffer{}, 0), - std::make_unique(std::move(chars), rmm::device_buffer{}, 0), + chars.release(), 0, {}); } diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp index 88617510394..36303a60aa9 100644 --- a/cpp/src/io/utilities/column_buffer.cpp +++ b/cpp/src/io/utilities/column_buffer.cpp @@ -68,26 +68,10 @@ std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_colu rmm::cuda_stream_view stream) { // no need for copies, just transfer ownership of the data_buffers to the columns - auto const state = mask_state::UNALLOCATED; - auto str_col = - _string_data.is_empty() - ? make_empty_column(data_type{type_id::INT8}) - : std::make_unique(data_type{type_id::INT8}, - string_size(), - std::move(_string_data), - cudf::detail::create_null_mask(size, state, stream, _mr), - state_null_count(state, size), - std::vector>{}); - auto offsets_col = - std::make_unique(data_type{type_to_id()}, - size + 1, - std::move(_data), - cudf::detail::create_null_mask(size + 1, state, stream, _mr), - state_null_count(state, size + 1), - std::vector>{}); - + auto offsets_col = std::make_unique( + data_type{type_to_id()}, size + 1, std::move(_data), rmm::device_buffer{}, 0); return make_strings_column( - size, std::move(offsets_col), std::move(str_col), null_count(), std::move(_null_mask)); + size, std::move(offsets_col), std::move(_string_data), null_count(), std::move(_null_mask)); } namespace { diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 9545811a542..8fd860d9492 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -861,9 +861,8 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, std::overflow_error); // CHARS column - std::unique_ptr chars = - strings::detail::create_chars_child_column(static_cast(bytes), stream, mr); - auto d_chars = chars->mutable_view().data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); single_thread_fn.d_chars = d_chars; thrust::for_each_n(rmm::exec_policy(stream), @@ -902,7 +901,7 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, return make_strings_column(col_size, std::move(offsets), - std::move(chars), + chars.release(), d_null_count.value(stream), std::move(null_mask)); } diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu index 6794838c70f..146b54c0d87 100644 --- a/cpp/src/json/json_path.cu +++ b/cpp/src/json/json_path.cu @@ -1010,7 +1010,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::get_value(offsets_view, col.size(), stream); // allocate output string column - auto chars = cudf::strings::detail::create_chars_child_column(output_size, stream, mr); + rmm::device_uvector chars(output_size, stream, mr); // potential optimization : if we know that all outputs are valid, we could skip creating // the validity mask altogether @@ -1018,7 +1018,6 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c cudf::detail::create_null_mask(col.size(), mask_state::UNINITIALIZED, stream, mr); // compute results - cudf::mutable_column_view chars_view(*chars); rmm::device_scalar d_valid_count{0, stream}; get_json_object_kernel @@ -1026,14 +1025,14 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c *cdv, std::get<0>(preprocess).value().data(), offsets_view.head(), - chars_view.head(), + chars.data(), static_cast(validity.data()), d_valid_count.data(), options); auto result = make_strings_column(col.size(), std::move(offsets), - std::move(chars), + chars.release(), col.size() - d_valid_count.value(stream), std::move(validity)); // unmatched array query may result in unsanitized '[' value in the result diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu index 7b37e2dc8f6..8f05b020a2e 100644 --- a/cpp/src/lists/interleave_columns.cu +++ b/cpp/src/lists/interleave_columns.cu @@ -237,7 +237,7 @@ struct interleave_list_entries_implrelease().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu index 6852b19af44..23c792ddcae 100644 --- a/cpp/src/replace/clamp.cu +++ b/cpp/src/replace/clamp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -106,7 +106,7 @@ std::unique_ptr clamp_string_column(strings_column_view const& inp return make_strings_column(input.size(), std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), std::move(cudf::detail::copy_bitmask(input.parent(), stream, mr))); } diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu index bd3e75e2e80..8ea229368cc 100644 --- a/cpp/src/replace/nulls.cu +++ b/cpp/src/replace/nulls.cu @@ -255,22 +255,19 @@ std::unique_ptr replace_nulls_column_kernel_forwarder::operator()< auto offsets_view = offsets->mutable_view(); // Allocate chars array and output null mask - std::unique_ptr output_chars = - cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - - auto output_chars_view = output_chars->mutable_view(); + rmm::device_uvector output_chars(bytes, stream, mr); replace_second<<>>( *device_in, *device_replacement, reinterpret_cast(valid_bits.data()), offsets_view.begin(), - output_chars_view.data(), + output_chars.data(), valid_count); return cudf::make_strings_column(input.size(), std::move(offsets), - std::move(output_chars), + output_chars.release(), input.size() - valid_counter.value(stream), std::move(valid_bits)); } diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu index 7cad2fb10d3..184c30246c7 100644 --- a/cpp/src/replace/replace.cu +++ b/cpp/src/replace/replace.cu @@ -187,7 +187,7 @@ template CUDF_KERNEL void replace_strings_second_pass(cudf::column_device_view input, cudf::column_device_view replacement, cudf::mutable_column_device_view offsets, - cudf::mutable_column_device_view strings, + char* strings, cudf::mutable_column_device_view indices) { cudf::size_type nrows = input.size(); @@ -211,9 +211,8 @@ CUDF_KERNEL void replace_strings_second_pass(cudf::column_device_view input, cudf::string_view output = (replace_idx == -1) ? input.element(idx) : replacement.element(replace_idx); - std::memcpy(strings.data() + offsets.data()[idx], - output.data(), - output.size_bytes()); + std::memcpy( + strings + offsets.data()[idx], output.data(), output.size_bytes()); } tid += stride; @@ -434,18 +433,15 @@ std::unique_ptr replace_kernel_forwarder::operator() output_chars = - cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - - auto output_chars_view = output_chars->mutable_view(); - auto device_chars = cudf::mutable_column_device_view::create(output_chars_view, stream); + rmm::device_uvector output_chars(bytes, stream, mr); + auto d_chars = output_chars.data(); replace_second<<>>( - *device_in, *device_replacement, *device_offsets, *device_chars, *device_indices); + *device_in, *device_replacement, *device_offsets, d_chars, *device_indices); return cudf::make_strings_column(input_col.size(), std::move(offsets), - std::move(output_chars), + output_chars.release(), null_count, std::move(valid_bits)); } diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu index deb0acb4742..22b45fe7a58 100644 --- a/cpp/src/reshape/interleave_columns.cu +++ b/cpp/src/reshape/interleave_columns.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -193,9 +193,8 @@ struct interleave_columns_implview().template data(); // Create the chars column - auto chars_column = strings::detail::create_chars_child_column(bytes, stream, mr); - // Fill the chars column - auto d_results_chars = chars_column->mutable_view().template data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_results_chars = chars.data(); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(0), @@ -215,7 +214,7 @@ struct interleave_columns_impl capitalizer(CapitalFn cfn, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto children = cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index c2e8033b42d..b3bf0e2a787 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -217,7 +217,7 @@ std::unique_ptr convert_case(strings_column_view const& input, cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr); return make_strings_column(input.size(), std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -264,15 +264,15 @@ std::unique_ptr convert_case(strings_column_view const& input, "Size of output exceeds the column size limit", std::overflow_error); - auto chars = create_chars_child_column(static_cast(bytes), stream, mr); + rmm::device_uvector chars(bytes, stream, mr); // second pass, write output converter.d_offsets = d_offsets; - converter.d_chars = chars->mutable_view().data(); + converter.d_chars = chars.data(); thrust::for_each_n(rmm::exec_policy(stream), count_itr, input.size(), converter); return make_strings_column(input.size(), std::move(offsets), - std::move(chars), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu index 35b0c0a2690..9c2a2701227 100644 --- a/cpp/src/strings/char_types/char_types.cu +++ b/cpp/src/strings/char_types/char_types.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -200,12 +200,13 @@ std::unique_ptr filter_characters_of_type(strings_column_view const& str rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); // return new strings column return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu index 0a11b6dc460..a48e84eac0c 100644 --- a/cpp/src/strings/combine/concatenate.cu +++ b/cpp/src/strings/combine/concatenate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -142,7 +142,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, // Create device views from the strings columns. auto d_table = table_device_view::create(strings_columns, stream); concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls}; - auto children = make_strings_children(fn, strings_count, stream, mr); + auto [offsets_column, chars_column] = make_strings_children(fn, strings_count, stream, mr); // create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( @@ -157,8 +157,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, mr); return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } @@ -237,7 +237,7 @@ std::unique_ptr concatenate(table_view const& strings_columns, multi_separator_concat_fn mscf{ *d_table, separator_col_view, separator_rep, col_rep, separate_nulls}; - auto children = make_strings_children(mscf, strings_count, stream, mr); + auto [offsets_column, chars_column] = make_strings_children(mscf, strings_count, stream, mr); // Create resulting null mask auto [null_mask, null_count] = cudf::detail::valid_if( @@ -253,8 +253,8 @@ std::unique_ptr concatenate(table_view const& strings_columns, mr); return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index 48304759f7a..0e0d6e437a7 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -173,8 +173,11 @@ std::unique_ptr join_strings(strings_column_view const& input, : rmm::device_buffer{0, stream, mr}; // perhaps this return a string_scalar instead of a single-row column - return make_strings_column( - 1, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask)); + return make_strings_column(1, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + null_count, + std::move(null_mask)); } } // namespace detail diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu index 372b49fb0ee..619f5feba15 100644 --- a/cpp/src/strings/combine/join_list_elements.cu +++ b/cpp/src/strings/combine/join_list_elements.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -215,8 +215,11 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string stream, mr); - return make_strings_column( - num_rows, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask)); + return make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + null_count, + std::move(null_mask)); } namespace { @@ -290,8 +293,11 @@ std::unique_ptr join_list_elements(lists_column_view const& lists_string stream, mr); - return make_strings_column( - num_rows, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask)); + return make_strings_column(num_rows, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + null_count, + std::move(null_mask)); } } // namespace detail diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu index e75f1a6fe0f..4fe0be7883f 100644 --- a/cpp/src/strings/convert/convert_booleans.cu +++ b/cpp/src/strings/convert/convert_booleans.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -147,7 +147,7 @@ std::unique_ptr from_booleans(column_view const& booleans, return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), booleans.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index d2609441d72..b7a662b0b76 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1158,7 +1158,7 @@ std::unique_ptr from_timestamps(column_view const& timestamps, return make_strings_column(timestamps.size(), std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), timestamps.null_count(), cudf::detail::copy_bitmask(timestamps, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu index 987087042cb..9a58926539c 100644 --- a/cpp/src/strings/convert/convert_durations.cu +++ b/cpp/src/strings/convert/convert_durations.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -422,7 +422,7 @@ struct dispatch_from_durations_fn { return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), durations.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu index 2c59f6dcd29..975f03b37d6 100644 --- a/cpp/src/strings/convert/convert_fixed_point.cu +++ b/cpp/src/strings/convert/convert_fixed_point.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -249,7 +249,7 @@ struct dispatch_from_fixed_point_fn { return make_strings_column(input.size(), std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu index 9b3ef8f452b..c56e723de8e 100644 --- a/cpp/src/strings/convert/convert_floats.cu +++ b/cpp/src/strings/convert/convert_floats.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -408,7 +408,7 @@ struct dispatch_from_floats_fn { return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), floats.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu index 8f656b149a5..68cff214507 100644 --- a/cpp/src/strings/convert/convert_hex.cu +++ b/cpp/src/strings/convert/convert_hex.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -180,12 +180,12 @@ struct dispatch_integers_to_hex_fn { { auto const d_column = column_device_view::create(input, stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( integer_to_hex_fn{*d_column}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu index 56637e88e19..364cb534d2f 100644 --- a/cpp/src/strings/convert/convert_integers.cu +++ b/cpp/src/strings/convert/convert_integers.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -367,7 +367,7 @@ struct dispatch_from_integers_fn { return make_strings_column(strings_count, std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), integers.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu index 75527e24e79..e07be26a23c 100644 --- a/cpp/src/strings/convert/convert_ipv4.cu +++ b/cpp/src/strings/convert/convert_ipv4.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -165,13 +165,13 @@ std::unique_ptr integers_to_ipv4(column_view const& integers, CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type"); - auto d_column = column_device_view::create(integers, stream); - auto children = cudf::strings::detail::make_strings_children( + auto d_column = column_device_view::create(integers, stream); + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( integers_to_ipv4_fn{*d_column}, integers.size(), stream, mr); return make_strings_column(integers.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), integers.null_count(), cudf::detail::copy_bitmask(integers, stream, mr)); } diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu index f9f2b91eb12..1f22aea284b 100644 --- a/cpp/src/strings/convert/convert_lists.cu +++ b/cpp/src/strings/convert/convert_lists.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -216,14 +216,17 @@ std::unique_ptr format_list_column(lists_column_view const& input, auto const d_separators = column_device_view::create(separators.parent(), stream); auto const d_na_rep = na_rep.value(stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( format_lists_fn{*d_input, *d_separators, d_na_rep, stack_buffer.data(), depth}, input.size(), stream, mr); - return make_strings_column( - input.size(), std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); + return make_strings_column(input.size(), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu index b16eb318b39..a9ddcfa12a2 100644 --- a/cpp/src/strings/convert/convert_urls.cu +++ b/cpp/src/strings/convert/convert_urls.cu @@ -135,12 +135,12 @@ std::unique_ptr url_encode(strings_column_view const& input, auto d_column = column_device_view::create(input.parent(), stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( url_encoder_fn{*d_column}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -409,8 +409,8 @@ std::unique_ptr url_decode(strings_column_view const& strings, auto out_chars_bytes = cudf::detail::get_value(offsets_view, offset_count - 1, stream); // create the chars column - auto chars_column = create_chars_child_column(out_chars_bytes, stream, mr); - auto d_out_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(out_chars_bytes, stream, mr); + auto d_out_chars = chars.data(); // decode and copy the characters from the input column to the output column url_decode_char_replacer @@ -422,7 +422,7 @@ std::unique_ptr url_decode(strings_column_view const& strings, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + chars.release(), strings.null_count(), std::move(null_mask)); } diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 8cabd0dc75f..c4564b1105b 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -228,9 +228,8 @@ std::unique_ptr concatenate(host_span columns, std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); }); // create output chars column - auto chars_column = create_chars_child_column(total_bytes, stream, mr); - auto d_new_chars = chars_column->mutable_view().data(); - chars_column->set_null_count(0); + rmm::device_uvector output_chars(total_bytes, stream, mr); + auto d_new_chars = output_chars.data(); // create output offsets column auto offsets_column = make_numeric_column( @@ -304,7 +303,7 @@ std::unique_ptr concatenate(host_span columns, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + output_chars.release(), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu index 3b798a87d54..331cdecc36f 100644 --- a/cpp/src/strings/copying/shift.cu +++ b/cpp/src/strings/copying/shift.cu @@ -114,19 +114,19 @@ std::unique_ptr shift(strings_column_view const& input, }(); // create output chars child column - auto chars_column = create_chars_child_column(static_cast(total_bytes), stream, mr); - auto d_chars = mutable_column_device_view::create(chars_column->mutable_view(), stream); + rmm::device_uvector chars(total_bytes, stream, mr); + auto d_chars = chars.data(); // run kernel to shift all the characters thrust::transform(rmm::exec_policy(stream), thrust::counting_iterator(0), thrust::counting_iterator(total_bytes), - d_chars->data(), + d_chars, shift_chars_fn{*d_input, d_fill_str, shift_offset}); // caller sets the null-mask return make_strings_column( - input.size(), std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + input.size(), std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } } // namespace cudf::strings::detail diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu index 49e1b11c1db..d2e3b6f6af3 100644 --- a/cpp/src/strings/filling/fill.cu +++ b/cpp/src/strings/filling/fill.cu @@ -98,7 +98,7 @@ std::unique_ptr fill(strings_column_view const& input, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index 9f95fedfe0b..7a26fc45dcb 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -139,11 +139,12 @@ std::unique_ptr filter_characters( // this utility calls the strip_fn to build the offsets and chars columns filter_fn ffn{*d_strings, keep_characters, table.begin(), table.end(), d_replacement}; - auto children = cudf::strings::detail::make_strings_children(ffn, strings.size(), stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(ffn, strings.size(), stream, mr); return make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index 850ccaa4535..ec77aea6338 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -112,7 +112,7 @@ std::unique_ptr pad(strings_column_view const& input, auto d_strings = column_device_view::create(input.parent(), stream); - auto children = [&] { + auto [offsets_column, chars_column] = [&] { if (side == side_type::LEFT) { auto fn = pad_fn{*d_strings, width, fill_char_size, d_fill_char}; return make_strings_children(fn, input.size(), stream, mr); @@ -125,8 +125,8 @@ std::unique_ptr pad(strings_column_view const& input, }(); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -151,11 +151,12 @@ std::unique_ptr zfill(strings_column_view const& input, if (input.is_empty()) return make_empty_column(type_id::STRING); auto d_strings = column_device_view::create(input.parent(), stream); - auto children = make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr); + auto [offsets_column, chars_column] = + make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu index 847a64f5602..b4a770f72bd 100644 --- a/cpp/src/strings/repeat_strings.cu +++ b/cpp/src/strings/repeat_strings.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -81,8 +81,6 @@ auto generate_empty_output(strings_column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto chars_column = create_chars_child_column(0, stream, mr); - auto offsets_column = make_numeric_column( data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); CUDF_CUDA_TRY(cudaMemsetAsync(offsets_column->mutable_view().template data(), @@ -92,7 +90,7 @@ auto generate_empty_output(strings_column_view const& input, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + rmm::device_buffer{}, input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -166,7 +164,7 @@ std::unique_ptr repeat_strings(strings_column_view const& input, make_strings_children(fn, strings_count * repeat_times, strings_count, stream, mr); return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -252,7 +250,7 @@ std::unique_ptr repeat_strings(strings_column_view const& input, return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + std::move(chars_column->release().data.release()[0]), null_count, std::move(null_mask)); } diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index fc11b7d80b3..edec525a913 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -126,8 +126,8 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, auto const d_strings = column_device_view::create(input.parent(), stream); - using BackRefIterator = decltype(backrefs.begin()); - auto children = make_strings_children( + using BackRefIterator = decltype(backrefs.begin()); + auto [offsets_column, chars_column] = make_strings_children( backrefs_fn{*d_strings, d_repl_template, backrefs.begin(), backrefs.end()}, *d_prog, input.size(), @@ -135,8 +135,8 @@ std::unique_ptr replace_with_backrefs(strings_column_view const& input, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu index a0f9d1136f3..3d0210d61b0 100644 --- a/cpp/src/strings/replace/multi.cu +++ b/cpp/src/strings/replace/multi.cu @@ -374,12 +374,8 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in }); // use this utility to gather the string parts into a contiguous chars column - auto chars = make_strings_column(indices.begin(), indices.end(), stream, mr); - // TODO ideally we can pass this chars_data as rmm buffer to make_strings_column - auto chars_data = chars->release().data; - auto const chars_size = chars_data->size(); - auto chars_col = std::make_unique( - data_type{type_id::INT8}, chars_size, std::move(*chars_data), rmm::device_buffer{}, 0); + auto chars = make_strings_column(indices.begin(), indices.end(), stream, mr); + auto chars_data = chars->release().data; // create offsets from the sizes offsets = @@ -388,7 +384,7 @@ std::unique_ptr replace_character_parallel(strings_column_view const& in // build the strings columns from the chars and offsets return make_strings_column(strings_count, std::move(offsets), - std::move(chars_col), + std::move(chars_data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } @@ -458,12 +454,12 @@ std::unique_ptr replace_string_parallel(strings_column_view const& input auto d_targets = column_device_view::create(targets.parent(), stream); auto d_replacements = column_device_view::create(repls.parent(), stream); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( replace_multi_fn{*d_strings, *d_targets, *d_replacements}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 3375cb7a789..c212d9f44ba 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -185,15 +185,15 @@ std::unique_ptr replace_re(strings_column_view const& input, auto found_ranges = rmm::device_uvector(d_progs.size() * input.size(), stream); - auto children = make_strings_children( + auto [offsets_column, chars_column] = make_strings_children( replace_multi_regex_fn{*d_strings, d_progs, found_ranges.data(), *d_repls}, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu index 8c4bd4490b9..936127f254b 100644 --- a/cpp/src/strings/replace/replace.cu +++ b/cpp/src/strings/replace/replace.cu @@ -486,9 +486,8 @@ std::unique_ptr replace_char_parallel(strings_column_view const& strings offsets_update_fn); // build the characters column - auto chars_column = - create_chars_child_column(chars_bytes + (delta_per_target * target_count), stream, mr); - auto d_out_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(chars_bytes + (delta_per_target * target_count), stream, mr); + auto d_out_chars = chars.data(); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(chars_start), @@ -501,7 +500,7 @@ std::unique_ptr replace_char_parallel(strings_column_view const& strings return make_strings_column(strings_count, std::move(offsets_column), - std::move(chars_column), + chars.release(), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -532,12 +531,12 @@ std::unique_ptr replace_row_parallel(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); // this utility calls the given functor to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( replace_row_parallel_fn{*d_strings, d_target, d_repl, maxrepl}, strings.size(), stream, mr); return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -697,12 +696,12 @@ std::unique_ptr replace_slice(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); // this utility calls the given functor to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( replace_slice_fn{*d_strings, d_repl, start, stop}, strings.size(), stream, mr); return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -733,8 +732,8 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, auto d_offsets = offsets_column->view().data(); // build chars column - auto chars_column = create_chars_child_column(bytes, stream, mr); - auto d_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); thrust::for_each_n(rmm::exec_policy(stream), thrust::make_counting_iterator(0), strings_count, @@ -745,7 +744,7 @@ std::unique_ptr replace_nulls(strings_column_view const& strings, }); return make_strings_column( - strings_count, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + strings_count, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index 502d5f1a52e..10d83932928 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -116,12 +116,12 @@ std::unique_ptr replace_re(strings_column_view const& input, auto const d_strings = column_device_view::create(input.parent(), stream); - auto children = make_strings_children( + auto [offsets_column, chars_column] = make_strings_children( replace_regex_fn{*d_strings, d_repl, maxrepl}, *d_prog, input.size(), stream, mr); return make_strings_column(input.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index 5a1fee92c7d..1e55986fdb8 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -209,7 +209,7 @@ std::unique_ptr slice_strings(strings_column_view const& strings, return make_strings_column(strings.size(), std::move(offsets), - std::move(chars), + std::move(chars->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 0ca5e103d3d..039a8ac8a62 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -110,12 +110,12 @@ std::unique_ptr translate(strings_column_view const& strings, auto d_strings = column_device_view::create(strings.parent(), stream); - auto children = make_strings_children( + auto [offsets_column, chars_column] = make_strings_children( translate_fn{*d_strings, table.begin(), table.end()}, strings.size(), stream, mr); return make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu index 1f125636208..c6d299424d2 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -429,8 +429,8 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const std::overflow_error); // build the output: inserting separators to the input character data - auto chars = cudf::strings::detail::create_chars_child_column(bytes, stream, mr); - auto d_chars = chars->mutable_view().data(); + rmm::device_uvector chars(bytes, stream, mr); + auto d_chars = chars.data(); auto const d_inserts = d_working.data(); // stores the insert positions auto offsets_at_non_zero = [d_spaces = d_spaces.data()] __device__(auto idx) { @@ -453,7 +453,7 @@ std::unique_ptr byte_pair_encoding(cudf::strings_column_view const return cudf::make_strings_column(input.size(), std::move(offsets), - std::move(chars), + chars.release(), input.null_count(), cudf::detail::copy_bitmask(input.parent(), stream, mr)); } diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu index 38cb7dd6753..60625d6383a 100644 --- a/cpp/src/text/detokenize.cu +++ b/cpp/src/text/detokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -156,15 +156,18 @@ std::unique_ptr detokenize(cudf::strings_column_view const& string cudf::string_view const d_separator(separator.data(), separator.size()); - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( detokenizer_fn{*strings_column, d_row_map, tokens_offsets.data(), d_separator}, output_count, stream, mr); // make the output strings column from the offsets and chars column - return cudf::make_strings_column( - output_count, std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); + return cudf::make_strings_column(output_count, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 1d3e98a25ad..882d9a04501 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -139,12 +139,15 @@ std::unique_ptr generate_ngrams(cudf::strings_column_view const& s // compute the number of strings of ngrams auto const ngrams_count = strings_count - ngrams + 1; - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( ngram_generator_fn{d_strings, ngrams, d_separator}, ngrams_count, stream, mr); // make the output strings column from the offsets and chars column - return cudf::make_strings_column( - ngrams_count, std::move(children.first), std::move(children.second), 0, rmm::device_buffer{}); + return cudf::make_strings_column(ngrams_count, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } } // namespace detail @@ -239,8 +242,11 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( generator, strings_count, total_ngrams, stream, mr); - return cudf::make_strings_column( - total_ngrams, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + return cudf::make_strings_column(total_ngrams, + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), + 0, + rmm::device_buffer{}); } namespace { diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu index bc5cd04eac6..642dca5fc47 100644 --- a/cpp/src/text/ngrams_tokenize.cu +++ b/cpp/src/text/ngrams_tokenize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -232,9 +232,8 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s rmm::device_uvector ngram_sizes(total_ngrams, stream); // build output chars column - auto chars_column = cudf::strings::detail::create_chars_child_column( - static_cast(output_chars_size), stream, mr); - auto d_chars = chars_column->mutable_view().data(); + rmm::device_uvector chars(output_chars_size, stream, mr); + auto d_chars = chars.data(); // Generate the ngrams into the chars column data buffer. // The ngram_builder_fn functor also fills the ngram_sizes vector with the // size of each ngram. @@ -253,11 +252,10 @@ std::unique_ptr ngrams_tokenize(cudf::strings_column_view const& s // build the offsets column -- converting the ngram sizes into offsets auto offsets_column = std::get<0>( cudf::detail::make_offsets_child_column(ngram_sizes.begin(), ngram_sizes.end(), stream, mr)); - chars_column->set_null_count(0); offsets_column->set_null_count(0); // create the output strings column return make_strings_column( - total_ngrams, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 5a0977d410f..d46ca25835f 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -182,12 +182,12 @@ std::unique_ptr normalize_spaces(cudf::strings_column_view const& auto d_strings = cudf::column_device_view::create(strings.parent(), stream); // build offsets and children using the normalize_space_fn - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( normalize_spaces_fn{*d_strings}, strings.size(), stream, mr); return cudf::make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } @@ -228,12 +228,12 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto d_strings = cudf::column_device_view::create(strings.parent(), stream); // build offsets and children using the codepoint_to_utf8_fn - auto children = cudf::strings::detail::make_strings_children( + auto [offsets_column, chars_column] = cudf::strings::detail::make_strings_children( codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, strings.size(), stream, mr); return cudf::make_strings_column(strings.size(), - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), cudf::detail::copy_bitmask(strings.parent(), stream, mr)); } diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index a4b28fe2dab..50d7bbd077d 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -228,12 +228,13 @@ std::unique_ptr replace_tokens(cudf::strings_column_view const& st rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls replacer to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr); // return new strings column return cudf::make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), std::move(null_mask)); } @@ -260,12 +261,13 @@ std::unique_ptr filter_tokens(cudf::strings_column_view const& str rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); // this utility calls filterer to build the offsets and chars columns - auto children = cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); + auto [offsets_column, chars_column] = + cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr); // return new strings column return cudf::make_strings_column(strings_count, - std::move(children.first), - std::move(children.second), + std::move(offsets_column), + std::move(chars_column->release().data.release()[0]), strings.null_count(), std::move(null_mask)); } diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu index c43b9dda56c..c256607fb23 100644 --- a/cpp/src/text/tokenize.cu +++ b/cpp/src/text/tokenize.cu @@ -219,14 +219,13 @@ std::unique_ptr character_tokenize(cudf::strings_column_view const return idx < chars_bytes ? cudf::strings::detail::is_begin_utf8_char(d_chars[idx]) : true; }); - // create the output chars column -- just a copy of the input's chars column - cudf::column_view chars_view( - cudf::data_type{cudf::type_id::INT8}, chars_bytes, d_chars, nullptr, 0); - auto chars_column = std::make_unique(chars_view, stream, mr); + // create the output chars buffer -- just a copy of the input's chars + rmm::device_uvector output_chars(chars_bytes, stream, mr); + thrust::copy(rmm::exec_policy(stream), d_chars, d_chars + chars_bytes, output_chars.data()); // return new strings column return cudf::make_strings_column( - num_characters, std::move(offsets_column), std::move(chars_column), 0, rmm::device_buffer{}); + num_characters, std::move(offsets_column), output_chars.release(), 0, rmm::device_buffer{}); } } // namespace detail diff --git a/cpp/src/transform/row_conversion.cu b/cpp/src/transform/row_conversion.cu index ef12fbeae52..b294369a90e 100644 --- a/cpp/src/transform/row_conversion.cu +++ b/cpp/src/transform/row_conversion.cu @@ -2509,8 +2509,7 @@ std::unique_ptr convert_from_rows(lists_column_view const& input, make_strings_column(num_rows, std::make_unique( std::move(string_col_offsets[string_idx]), rmm::device_buffer{}, 0), - std::make_unique( - std::move(string_data_cols[string_idx]), rmm::device_buffer{}, 0), + string_data_cols[string_idx].release(), 0, std::move(*string_data.null_mask.release())); // Null count set to 0, temporarily. Will be fixed up before return. diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 06fb687ac2d..0f7c1053adf 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -406,9 +407,9 @@ TEST_F(OverflowTest, OverflowTest) // try and concatenate 6 string columns of with 1 billion chars in each auto offsets = cudf::test::fixed_width_column_wrapper{0, size}; - auto many_chars = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, size); + auto many_chars = rmm::device_uvector(size, cudf::get_default_stream()); auto col = cudf::make_strings_column( - 1, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{}); + 1, offsets.release(), many_chars.release(), 0, rmm::device_buffer{}); cudf::table_view tbl({*col}); EXPECT_THROW(cudf::concatenate(std::vector({tbl, tbl, tbl, tbl, tbl, tbl})), @@ -422,7 +423,7 @@ TEST_F(OverflowTest, OverflowTest) // try and concatenate 6 string columns 1 billion rows each auto many_offsets = cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT32}, size + 1); - auto chars = cudf::test::fixed_width_column_wrapper{0, 1, 2}; + auto chars = rmm::device_uvector(3, cudf::get_default_stream()); auto col = cudf::make_strings_column( size, std::move(many_offsets), chars.release(), 0, rmm::device_buffer{}); @@ -533,10 +534,9 @@ TEST_F(OverflowTest, Presliced) auto offset_gen = cudf::detail::make_counting_transform_iterator( 0, [string_size](cudf::size_type index) { return index * string_size; }); cudf::test::fixed_width_column_wrapper offsets(offset_gen, offset_gen + num_rows + 1); - auto many_chars = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, total_chars_size); - auto col = cudf::make_strings_column( - num_rows, offsets.release(), std::move(many_chars), 0, rmm::device_buffer{}); + auto many_chars = rmm::device_uvector(total_chars_size, cudf::get_default_stream()); + auto col = cudf::make_strings_column( + num_rows, offsets.release(), many_chars.release(), 0, rmm::device_buffer{}); auto sliced = cudf::split(*col, {(num_rows / 2) - 1}); @@ -557,13 +557,12 @@ TEST_F(OverflowTest, Presliced) constexpr cudf::size_type num_rows = total_chars_size / string_size; // try and concatenate 4 string columns of with ~1/2 billion chars in each - auto offsets = cudf::sequence(num_rows + 1, + auto offsets = cudf::sequence(num_rows + 1, cudf::numeric_scalar(0), cudf::numeric_scalar(string_size)); - auto many_chars = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, total_chars_size); - auto col = cudf::make_strings_column( - num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{}); + auto many_chars = rmm::device_uvector(total_chars_size, cudf::get_default_stream()); + auto col = cudf::make_strings_column( + num_rows, std::move(offsets), many_chars.release(), 0, rmm::device_buffer{}); // should pass (with 2 rows to spare) // leaving this disabled as it typically runs out of memory on a T4 @@ -636,7 +635,7 @@ TEST_F(OverflowTest, Presliced) cudf::numeric_scalar(0), cudf::numeric_scalar(list_size)); - auto col = cudf::make_strings_column( + auto col = cudf::make_lists_column( num_rows, std::move(offsets), std::move(struct_col), 0, rmm::device_buffer{}); // should pass (with 2 rows to spare) @@ -722,13 +721,12 @@ TEST_F(OverflowTest, BigColumnsSmallSlices) constexpr cudf::size_type num_rows = 1024; constexpr cudf::size_type string_size = inner_size / num_rows; - auto offsets = cudf::sequence(num_rows + 1, + auto offsets = cudf::sequence(num_rows + 1, cudf::numeric_scalar(0), cudf::numeric_scalar(string_size)); - auto many_chars = - cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::INT8}, inner_size); - auto col = cudf::make_strings_column( - num_rows, std::move(offsets), std::move(many_chars), 0, rmm::device_buffer{}); + auto many_chars = rmm::device_uvector(inner_size, cudf::get_default_stream()); + auto col = cudf::make_strings_column( + num_rows, std::move(offsets), many_chars.release(), 0, rmm::device_buffer{}); auto sliced = cudf::slice(*col, {16, 32}); diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp index 13459197aa3..2d9e2035e5e 100644 --- a/cpp/tests/strings/contains_tests.cpp +++ b/cpp/tests/strings/contains_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -297,17 +297,14 @@ TEST_F(StringsContainsTests, HexTest) std::vector offsets( {thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + count + 1}); - auto d_chars = std::make_unique( - cudf::detail::make_device_uvector_sync( - ascii_chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_chars = cudf::detail::make_device_uvector_sync( + ascii_chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( cudf::detail::make_device_uvector_sync( offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), rmm::device_buffer{}, 0); - auto input = cudf::make_strings_column(count, std::move(d_offsets), std::move(d_chars), 0, {}); + auto input = cudf::make_strings_column(count, std::move(d_offsets), d_chars.release(), 0, {}); auto strings_view = cudf::strings_column_view(input->view()); for (auto ch : ascii_chars) { diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu index 5381ad63bc3..64123690aea 100644 --- a/cpp/tests/strings/factories_test.cu +++ b/cpp/tests/strings/factories_test.cu @@ -145,11 +145,8 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets) } std::vector h_nulls{h_null_mask}; - auto d_buffer = std::make_unique( - cudf::detail::make_device_uvector_sync( - h_buffer, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0); + auto d_buffer = cudf::detail::make_device_uvector_sync( + h_buffer, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_offsets = std::make_unique( cudf::detail::make_device_uvector_sync( h_offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), @@ -158,7 +155,7 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets) auto d_nulls = cudf::detail::make_device_uvector_sync( h_nulls, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto column = cudf::make_strings_column( - count, std::move(d_offsets), std::move(d_buffer), null_count, d_nulls.release()); + count, std::move(d_offsets), d_buffer.release(), null_count, d_nulls.release()); EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING}); EXPECT_EQ(column->null_count(), null_count); EXPECT_EQ(1, column->num_children()); @@ -197,8 +194,7 @@ TEST_F(StringsFactoriesTest, CreateScalar) TEST_F(StringsFactoriesTest, EmptyStringsColumn) { - auto d_chars = std::make_unique( - rmm::device_uvector{0, cudf::get_default_stream()}, rmm::device_buffer{}, 0); + auto d_chars = rmm::device_uvector(0, cudf::get_default_stream()); auto d_offsets = std::make_unique( cudf::detail::make_zeroed_device_uvector_sync( 1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()), @@ -207,7 +203,7 @@ TEST_F(StringsFactoriesTest, EmptyStringsColumn) rmm::device_uvector d_nulls{0, cudf::get_default_stream()}; auto results = - cudf::make_strings_column(0, std::move(d_offsets), std::move(d_chars), 0, d_nulls.release()); + cudf::make_strings_column(0, std::move(d_offsets), d_chars.release(), 0, d_nulls.release()); cudf::test::expect_column_empty(results->view()); rmm::device_uvector> d_strings{ diff --git a/cpp/tests/transform/row_conversion.cpp b/cpp/tests/transform/row_conversion.cpp index e54929f1651..542ccc5e2d5 100644 --- a/cpp/tests/transform/row_conversion.cpp +++ b/cpp/tests/transform/row_conversion.cpp @@ -14,26 +14,20 @@ * limitations under the License. */ +#include +#include +#include +#include + #include #include -#include -#include #include #include #include #include -#include -#include -#include -#include -#include -#include - -#include -#include -#include #include +#include struct ColumnToRowTests : public cudf::test::BaseFixture {}; struct RowToColumnTests : public cudf::test::BaseFixture {}; @@ -833,19 +827,7 @@ TEST_F(RowToColumnTests, SimpleString) EXPECT_EQ(new_rows.size(), 1); for (auto& row : new_rows) { auto new_cols = cudf::convert_from_rows(cudf::lists_column_view(*row), schema); - EXPECT_EQ(row->size(), 5); - auto const num_columns = new_cols->num_columns(); - - cudf::strings_column_view str_col = new_cols->get_column(1).view(); - std::vector> col_data; - std::vector> offset_data; - for (int i = 0; i < num_columns; ++i) { - offset_data.emplace_back( - std::get<0>(cudf::test::to_host(str_col.offsets()))); - col_data.emplace_back(std::get<0>(cudf::test::to_host(str_col.chars()))); - } - CUDF_TEST_EXPECT_TABLES_EQUIVALENT(in, *new_cols); } } diff --git a/dependencies.yaml b/dependencies.yaml index 719794e6a19..cd9591b89e2 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -534,16 +534,19 @@ dependencies: - {matrix: null, packages: *run_cudf_packages_all_cu11} - output_types: conda matrices: + - matrix: {cuda: "12.*"} + packages: + - pynvjitlink - matrix: {cuda: "11.*"} packages: - cubinlinker - ptxcompiler - - {matrix: null, packages: null} - output_types: [requirements, pyproject] matrices: - matrix: {cuda: "12.*"} packages: - rmm-cu12==24.2.* + - pynvjitlink-cu12 - matrix: {cuda: "11.*"} packages: - rmm-cu11==24.2.* @@ -620,7 +623,7 @@ dependencies: - fastavro>=0.22.9 - hypothesis - pytest-benchmark - - pytest-cases<3.8.2 + - pytest-cases>=3.8.2 - python-snappy>=0.6.0 - scipy - output_types: conda diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 6bcc9f24e1c..20e8e8e3c68 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -109,12 +109,12 @@ def clean_definitions(root): # All of these in type declarations cause Breathe to choke. # For friend, see https://github.com/breathe-doc/breathe/issues/916 strings_to_remove = ("__forceinline__", "CUDF_HOST_DEVICE", "decltype(auto)", "friend") - for field in (".//type", ".//definition"): - for type_ in root.findall(field): - if type_.text is not None: - for string in strings_to_remove: - type_.text = type_.text.replace(string, "") - + for node in root.iter(): + for string in strings_to_remove: + if node.text is not None: + node.text = node.text.replace(string, "") + if node.tail is not None: + node.tail = node.tail.replace(string, "") def clean_all_xml_files(path): for fn in glob.glob(os.path.join(path, "*.xml")): diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py index f908a995c2a..59d73015962 100644 --- a/python/cudf/benchmarks/API/bench_dataframe.py +++ b/python/cudf/benchmarks/API/bench_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Benchmarks of DataFrame methods.""" @@ -178,6 +178,8 @@ def bench_nsmallest(benchmark, dataframe, num_cols_to_sort, n): benchmark(dataframe.nsmallest, n, by) -@pytest_cases.parametrize_with_cases("dataframe, cond, other", prefix="where") +@pytest_cases.parametrize_with_cases( + "dataframe, cond, other", prefix="where", cases="cases_dataframe" +) def bench_where(benchmark, dataframe, cond, other): benchmark(dataframe.where, cond, other) diff --git a/python/cudf/benchmarks/API/bench_functions.py b/python/cudf/benchmarks/API/bench_functions.py index ec4be221d9f..93109838900 100644 --- a/python/cudf/benchmarks/API/bench_functions.py +++ b/python/cudf/benchmarks/API/bench_functions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Benchmarks of free functions that accept cudf objects.""" @@ -9,7 +9,9 @@ from utils import benchmark_with_object -@pytest_cases.parametrize_with_cases("objs", prefix="concat") +@pytest_cases.parametrize_with_cases( + "objs", prefix="concat", cases="cases_functions" +) @pytest.mark.parametrize( "axis", [ diff --git a/python/cudf/benchmarks/API/bench_dataframe_cases.py b/python/cudf/benchmarks/API/cases_dataframe.py similarity index 88% rename from python/cudf/benchmarks/API/bench_dataframe_cases.py rename to python/cudf/benchmarks/API/cases_dataframe.py index fc41d141c8a..d12b9776f1b 100644 --- a/python/cudf/benchmarks/API/bench_dataframe_cases.py +++ b/python/cudf/benchmarks/API/cases_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. from utils import benchmark_with_object diff --git a/python/cudf/benchmarks/API/bench_functions_cases.py b/python/cudf/benchmarks/API/cases_functions.py similarity index 99% rename from python/cudf/benchmarks/API/bench_functions_cases.py rename to python/cudf/benchmarks/API/cases_functions.py index c81f8f20f80..6bc66aa4a9b 100644 --- a/python/cudf/benchmarks/API/bench_functions_cases.py +++ b/python/cudf/benchmarks/API/cases_functions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. """Test cases for benchmarks in bench_functions.py.""" diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index acd0ba519dd..45aa1081b8d 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -5,13 +5,13 @@ from typing import Literal import cupy as cp import numpy as np +import pandas as pd import rmm import cudf import cudf._lib as libcudf from cudf._lib import pylibcudf -from cudf.api.types import is_categorical_dtype, is_datetime64tz_dtype from cudf.core.buffer import ( Buffer, ExposureTrackedBuffer, @@ -344,10 +344,10 @@ cdef class Column: ) cdef mutable_column_view mutable_view(self) except *: - if is_categorical_dtype(self.dtype): + if isinstance(self.dtype, cudf.CategoricalDtype): col = self.base_children[0] data_dtype = col.dtype - elif is_datetime64tz_dtype(self.dtype): + elif isinstance(self.dtype, pd.DatetimeTZDtype): col = self data_dtype = _get_base_dtype(col.dtype) else: @@ -407,10 +407,10 @@ cdef class Column: return self._view(c_null_count) cdef column_view _view(self, libcudf_types.size_type null_count) except *: - if is_categorical_dtype(self.dtype): + if isinstance(self.dtype, cudf.CategoricalDtype): col = self.base_children[0] data_dtype = col.dtype - elif is_datetime64tz_dtype(self.dtype): + elif isinstance(self.dtype, pd.DatetimeTZDtype): col = self data_dtype = _get_base_dtype(col.dtype) else: @@ -482,7 +482,7 @@ cdef class Column: # categoricals because cudf supports ordered and unordered categoricals # while libcudf supports only unordered categoricals (see # https://github.com/rapidsai/cudf/pull/8567). - if is_categorical_dtype(self.dtype): + if isinstance(self.dtype, cudf.CategoricalDtype): col = self.base_children[0] else: col = self @@ -648,7 +648,7 @@ cdef class Column: """ column_owner = isinstance(owner, Column) mask_owner = owner - if column_owner and is_categorical_dtype(owner.dtype): + if column_owner and isinstance(owner.dtype, cudf.CategoricalDtype): owner = owner.base_children[0] size = cv.size() diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index f332fead8d1..8848649736b 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -1,16 +1,17 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from functools import singledispatch from pandas.core.groupby.groupby import DataError -from cudf.api.types import ( - is_categorical_dtype, - is_decimal_dtype, - is_interval_dtype, - is_list_dtype, - is_string_dtype, - is_struct_dtype, -) +from cudf.api.types import is_string_dtype from cudf.core.buffer import acquire_spill_lock +from cudf.core.dtypes import ( + CategoricalDtype, + DecimalDtype, + IntervalDtype, + ListDtype, + StructDtype, +) from libcpp cimport bool from libcpp.memory cimport unique_ptr @@ -73,6 +74,43 @@ _DECIMAL_AGGS = { ctypedef const scalar constscalar +@singledispatch +def get_valid_aggregation(dtype): + if is_string_dtype(dtype): + return _STRING_AGGS + return "ALL" + + +@get_valid_aggregation.register +def _(dtype: ListDtype): + return _LIST_AGGS + + +@get_valid_aggregation.register +def _(dtype: CategoricalDtype): + return _CATEGORICAL_AGGS + + +@get_valid_aggregation.register +def _(dtype: ListDtype): + return _LIST_AGGS + + +@get_valid_aggregation.register +def _(dtype: StructDtype): + return _STRUCT_AGGS + + +@get_valid_aggregation.register +def _(dtype: IntervalDtype): + return _INTERVAL_AGGS + + +@get_valid_aggregation.register +def _(dtype: DecimalDtype): + return _DECIMAL_AGGS + + cdef _agg_result_from_columns( vector[libcudf_groupby.aggregation_result]& c_result_columns, set column_included, @@ -187,15 +225,7 @@ cdef class GroupBy: for i, (col, aggs) in enumerate(zip(values, aggregations)): dtype = col.dtype - valid_aggregations = ( - _LIST_AGGS if is_list_dtype(dtype) - else _STRING_AGGS if is_string_dtype(dtype) - else _CATEGORICAL_AGGS if is_categorical_dtype(dtype) - else _STRUCT_AGGS if is_struct_dtype(dtype) - else _INTERVAL_AGGS if is_interval_dtype(dtype) - else _DECIMAL_AGGS if is_decimal_dtype(dtype) - else "ALL" - ) + valid_aggregations = get_valid_aggregation(dtype) included_aggregations_i = [] c_agg_request = move(libcudf_groupby.aggregation_request()) @@ -258,15 +288,7 @@ cdef class GroupBy: for i, (col, aggs) in enumerate(zip(values, aggregations)): dtype = col.dtype - valid_aggregations = ( - _LIST_AGGS if is_list_dtype(dtype) - else _STRING_AGGS if is_string_dtype(dtype) - else _CATEGORICAL_AGGS if is_categorical_dtype(dtype) - else _STRUCT_AGGS if is_struct_dtype(dtype) - else _INTERVAL_AGGS if is_interval_dtype(dtype) - else _DECIMAL_AGGS if is_decimal_dtype(dtype) - else "ALL" - ) + valid_aggregations = get_valid_aggregation(dtype) included_aggregations_i = [] c_agg_request = move(libcudf_groupby.scan_request()) diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx index 8fd2a409d90..13c8ce43ea3 100644 --- a/python/cudf/cudf/_lib/interop.pyx +++ b/python/cudf/cudf/_lib/interop.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cpython cimport pycapsule from libcpp.memory cimport shared_ptr, unique_ptr @@ -18,8 +18,8 @@ from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns -from cudf.api.types import is_list_dtype, is_struct_dtype from cudf.core.buffer import acquire_spill_lock +from cudf.core.dtypes import ListDtype, StructDtype def from_dlpack(dlpack_capsule): @@ -98,7 +98,7 @@ cdef vector[column_metadata] gather_metadata(object cols_dtypes) except *: if cols_dtypes is not None: for idx, (col_name, col_dtype) in enumerate(cols_dtypes): cpp_metadata.push_back(column_metadata(col_name.encode())) - if is_struct_dtype(col_dtype) or is_list_dtype(col_dtype): + if isinstance(col_dtype, (ListDtype, StructDtype)): _set_col_children_metadata(col_dtype, cpp_metadata[idx]) else: raise TypeError( @@ -113,14 +113,14 @@ cdef _set_col_children_metadata(dtype, cdef column_metadata element_metadata - if is_struct_dtype(dtype): + if isinstance(dtype, StructDtype): for name, value in dtype.fields.items(): element_metadata = column_metadata(name.encode()) _set_col_children_metadata( value, element_metadata ) col_meta.children_meta.push_back(element_metadata) - elif is_list_dtype(dtype): + elif isinstance(dtype, ListDtype): col_meta.children_meta.reserve(2) # Offsets - child 0 col_meta.children_meta.push_back(column_metadata()) diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 9b027a4d275..ae978d18813 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from cpython.buffer cimport PyBUF_READ from cpython.memoryview cimport PyMemoryView_FromMemory @@ -23,7 +23,7 @@ import errno import io import os -from cudf.api.types import is_struct_dtype +from cudf.core.dtypes import StructDtype # Converts the Python source input to libcudf IO source_info @@ -172,7 +172,7 @@ cdef Column update_column_struct_field_names( ) col.set_base_children(tuple(children)) - if is_struct_dtype(col): + if isinstance(col.dtype, StructDtype): field_names.reserve(len(col.base_children)) for i in range(info.children.size()): field_names.push_back(info.children[i].name) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 437c3ef6ec4..c361a3f00c4 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # cython: boundscheck = False @@ -17,6 +17,7 @@ from libcpp.utility cimport move from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types +from cudf._lib.column cimport Column from cudf._lib.cpp.io.data_sink cimport data_sink from cudf._lib.cpp.io.json cimport ( json_reader_options, @@ -42,10 +43,6 @@ from cudf._lib.io.utils cimport ( from cudf._lib.types cimport dtype_to_data_type from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table -from cudf.api.types import is_list_dtype, is_struct_dtype - -from cudf._lib.column cimport Column - cpdef read_json(object filepaths_or_buffers, object dtype, @@ -214,13 +211,12 @@ def write_json( cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *: cdef schema_element s_element cdef data_type lib_type - if cudf.api.types.is_categorical_dtype(dtype): + dtype = cudf.dtype(dtype) + if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "CategoricalDtype as dtype is not yet " "supported in JSON reader" ) - - dtype = cudf.dtype(dtype) lib_type = dtype_to_data_type(dtype) s_element.type = lib_type if isinstance(dtype, cudf.StructDtype): @@ -237,19 +233,18 @@ cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *: cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: - if cudf.api.types.is_categorical_dtype(dtype): + dtype = cudf.dtype(dtype) + if isinstance(dtype, cudf.CategoricalDtype): raise NotImplementedError( "CategoricalDtype as dtype is not yet " "supported in JSON reader" ) - - dtype = cudf.dtype(dtype) return dtype_to_data_type(dtype) cdef _set_col_children_metadata(Column col, column_name_info& col_meta): cdef column_name_info child_info - if is_struct_dtype(col): + if isinstance(col.dtype, cudf.StructDtype): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): @@ -258,7 +253,7 @@ cdef _set_col_children_metadata(Column col, _set_col_children_metadata( child_col, col_meta.children[i] ) - elif is_list_dtype(col): + elif isinstance(col.dtype, cudf.ListDtype): for i, child_col in enumerate(col.children): col_meta.children.push_back(child_info) _set_col_children_metadata( diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 49d93402c82..2cbdf76030b 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -59,7 +59,6 @@ from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table from pyarrow.lib import NativeFile from cudf._lib.utils import _index_level_name, generate_pandas_metadata -from cudf.api.types import is_list_dtype, is_struct_dtype cpdef read_raw_orc_statistics(filepath_or_buffer): @@ -489,7 +488,7 @@ cdef class ORCWriter: cdef _set_col_children_metadata(Column col, column_in_metadata& col_meta, list_column_as_map=False): - if is_struct_dtype(col): + if isinstance(col.dtype, cudf.StructDtype): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): @@ -497,7 +496,7 @@ cdef _set_col_children_metadata(Column col, _set_col_children_metadata( child_col, col_meta.child(i), list_column_as_map ) - elif is_list_dtype(col): + elif isinstance(col.dtype, cudf.ListDtype): if list_column_as_map: col_meta.set_list_column_as_map() _set_col_children_metadata( diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 4acb1ce10b1..27efc5e1ecd 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. # cython: boundscheck = False @@ -18,12 +18,7 @@ import numpy as np from cython.operator cimport dereference -from cudf.api.types import ( - is_decimal_dtype, - is_list_dtype, - is_list_like, - is_struct_dtype, -) +from cudf.api.types import is_list_like from cudf._lib.utils cimport data_from_unique_ptr @@ -220,7 +215,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, # update the decimal precision of each column for col in names: - if is_decimal_dtype(df._data[col].dtype): + if isinstance(df._data[col].dtype, cudf.core.dtypes.DecimalDtype): df._data[col].dtype.precision = ( meta_data_per_column[col]["metadata"]["precision"] ) @@ -703,7 +698,7 @@ cdef _set_col_metadata( # is true. col_meta.set_nullability(True) - if is_struct_dtype(col): + if isinstance(col.dtype, cudf.StructDtype): for i, (child_col, name) in enumerate( zip(col.children, list(col.dtype.fields)) ): @@ -713,13 +708,11 @@ cdef _set_col_metadata( col_meta.child(i), force_nullable_schema ) - elif is_list_dtype(col): + elif isinstance(col.dtype, cudf.ListDtype): _set_col_metadata( col.children[1], col_meta.child(1), force_nullable_schema ) - else: - if is_decimal_dtype(col): - col_meta.set_decimal_precision(col.dtype.precision) - return + elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype): + col_meta.set_decimal_precision(col.dtype.precision) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 27fb9e994f0..37708a4e3ba 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. import copy @@ -14,12 +14,7 @@ from libcpp.utility cimport move import cudf from cudf._lib import pylibcudf from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf.core.dtypes import ( - ListDtype, - StructDtype, - is_list_dtype, - is_struct_dtype, -) +from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT cimport cudf._lib.cpp.types as libcudf_types @@ -79,9 +74,9 @@ def gather_metadata(dtypes): out = [] for name, dtype in dtypes.items(): v = pylibcudf.interop.ColumnMetadata(name) - if is_struct_dtype(dtype): + if isinstance(dtype, cudf.StructDtype): v.children_meta = gather_metadata(dtype.fields) - elif is_list_dtype(dtype): + elif isinstance(dtype, cudf.ListDtype): # Offsets column is unnamed and has no children v.children_meta.append(pylibcudf.interop.ColumnMetadata("")) v.children_meta.extend( diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index d87104bf168..1b4f4617e97 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. from enum import IntEnum @@ -238,15 +238,15 @@ cdef dtype_from_column_view(column_view cv): cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: cdef libcudf_types.type_id tid - if cudf.api.types.is_list_dtype(dtype): + if isinstance(dtype, cudf.ListDtype): tid = libcudf_types.type_id.LIST - elif cudf.api.types.is_struct_dtype(dtype): + elif isinstance(dtype, cudf.StructDtype): tid = libcudf_types.type_id.STRUCT - elif cudf.api.types.is_decimal128_dtype(dtype): + elif isinstance(dtype, cudf.Decimal128Dtype): tid = libcudf_types.type_id.DECIMAL128 - elif cudf.api.types.is_decimal64_dtype(dtype): + elif isinstance(dtype, cudf.Decimal64Dtype): tid = libcudf_types.type_id.DECIMAL64 - elif cudf.api.types.is_decimal32_dtype(dtype): + elif isinstance(dtype, cudf.Decimal32Dtype): tid = libcudf_types.type_id.DECIMAL32 else: tid = ( @@ -259,21 +259,21 @@ cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: return libcudf_types.data_type(tid) cpdef dtype_to_pylibcudf_type(dtype): - if cudf.api.types.is_list_dtype(dtype): + if isinstance(dtype, cudf.ListDtype): return pylibcudf.DataType(pylibcudf.TypeId.LIST) - elif cudf.api.types.is_struct_dtype(dtype): + elif isinstance(dtype, cudf.StructDtype): return pylibcudf.DataType(pylibcudf.TypeId.STRUCT) - elif cudf.api.types.is_decimal_dtype(dtype): - if cudf.api.types.is_decimal128_dtype(dtype): - tid = pylibcudf.TypeId.DECIMAL128 - elif cudf.api.types.is_decimal64_dtype(dtype): - tid = pylibcudf.TypeId.DECIMAL64 - else: - tid = pylibcudf.TypeId.DECIMAL32 + elif isinstance(dtype, cudf.Decimal128Dtype): + tid = pylibcudf.TypeId.DECIMAL128 + return pylibcudf.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal64Dtype): + tid = pylibcudf.TypeId.DECIMAL64 + return pylibcudf.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal32Dtype): + tid = pylibcudf.TypeId.DECIMAL32 return pylibcudf.DataType(tid, -dtype.scale) - # libcudf types don't support localization so convert to the base type - if isinstance(dtype, pd.DatetimeTZDtype): + elif isinstance(dtype, pd.DatetimeTZDtype): dtype = np.dtype(f"=3.8.2", "pytest-cov", "pytest-xdist", "python-snappy>=0.6.0",