From 0cac2a9d68341a38721be16132ead14cf4a0d70b Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Mon, 22 Jul 2024 14:18:21 -0700 Subject: [PATCH 01/15] Remove size constraints on source files in batched JSON reading (#16162) Addresses https://github.com/rapidsai/cudf/issues/16138 The batched multi-source JSON reader fails when the size of any of the input source buffers exceeds `INT_MAX` bytes. The goal of this PR is to remove this constraint by modifying the batching behavior of the reader. Instead of constructing batches that include entire source files, the batches are now constructed at the granularity of byte ranges of size at most `INT_MAX` bytes, Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/16162 --- cpp/include/cudf/io/json.hpp | 4 +- cpp/src/io/json/read_json.cu | 139 +++++++++--------- cpp/src/io/json/read_json.hpp | 18 ++- cpp/tests/CMakeLists.txt | 14 +- .../json_chunked_reader.cu} | 81 ++-------- .../json_quote_normalization_test.cpp | 0 cpp/tests/io/{ => json}/json_test.cpp | 0 cpp/tests/io/{ => json}/json_tree.cpp | 0 .../io/{ => json}/json_type_cast_test.cu | 0 cpp/tests/io/json/json_utils.cuh | 105 +++++++++++++ .../json_whitespace_normalization_test.cu | 0 cpp/tests/io/{ => json}/json_writer.cpp | 0 cpp/tests/io/{ => json}/nested_json_test.cpp | 0 .../{json_tests.cpp => json_tests.cu} | 45 +++++- 14 files changed, 242 insertions(+), 164 deletions(-) rename cpp/tests/io/{json_chunked_reader.cpp => json/json_chunked_reader.cu} (64%) rename cpp/tests/io/{ => json}/json_quote_normalization_test.cpp (100%) rename cpp/tests/io/{ => json}/json_test.cpp (100%) rename cpp/tests/io/{ => json}/json_tree.cpp (100%) rename cpp/tests/io/{ => json}/json_type_cast_test.cu (100%) create mode 100644 cpp/tests/io/json/json_utils.cuh rename cpp/tests/io/{ => json}/json_whitespace_normalization_test.cu (100%) rename cpp/tests/io/{ => json}/json_writer.cpp (100%) rename cpp/tests/io/{ => json}/nested_json_test.cpp (100%) rename cpp/tests/large_strings/{json_tests.cpp => json_tests.cu} (50%) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 7af90766ad0..d47266fdd12 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -333,14 +333,14 @@ class json_reader_options { * * @param offset Number of bytes of offset */ - void set_byte_range_offset(size_type offset) { _byte_range_offset = offset; } + void set_byte_range_offset(size_t offset) { _byte_range_offset = offset; } /** * @brief Set number of bytes to read. 
* * @param size Number of bytes to read */ - void set_byte_range_size(size_type size) { _byte_range_size = size; } + void set_byte_range_size(size_t size) { _byte_range_size = size; } /** * @brief Set delimiter separating records in JSON lines diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 9cd39038348..0ba4dedfc34 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -148,20 +148,12 @@ device_span ingest_raw_input(device_span buffer, return buffer.first(uncomp_data.size()); } -size_type find_first_delimiter_in_chunk(host_span> sources, - json_reader_options const& reader_opts, - char const delimiter, - rmm::cuda_stream_view stream) +size_t estimate_size_per_subchunk(size_t chunk_size) { - auto total_source_size = sources_size(sources, 0, 0) + (sources.size() - 1); - rmm::device_uvector buffer(total_source_size, stream); - auto readbufspan = ingest_raw_input(buffer, - sources, - reader_opts.get_compression(), - reader_opts.get_byte_range_offset(), - reader_opts.get_byte_range_size(), - stream); - return find_first_delimiter(readbufspan, '\n', stream); + auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); }; + // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to + // 10kb) and the byte range size + return geometric_mean(std::ceil((double)chunk_size / num_subchunks), min_subchunk_size); } /** @@ -183,7 +175,6 @@ datasource::owning_buffer> get_record_range_raw_input( rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); }; size_t const total_source_size = sources_size(sources, 0, 0); auto constexpr num_delimiter_chars = 1; @@ -198,17 +189,8 @@ datasource::owning_buffer> get_record_range_raw_input( auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset; chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size; - // Some magic numbers - constexpr int num_subchunks = 10; // per chunk_size - constexpr size_t min_subchunk_size = 10000; - int const num_subchunks_prealloced = should_load_all_sources ? 0 : 3; - constexpr int estimated_compression_ratio = 4; - - // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to - // 10kb) and the byte range size - - size_t const size_per_subchunk = - geometric_mean(std::ceil((double)chunk_size / num_subchunks), min_subchunk_size); + int const num_subchunks_prealloced = should_load_all_sources ? 0 : max_subchunks_prealloced; + size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); // The allocation for single source compressed input is estimated by assuming a ~4:1 // compression ratio. For uncompressed inputs, we can getter a better estimate using the idea @@ -308,67 +290,78 @@ table_with_metadata read_json(host_span> sources, "Multiple inputs are supported only for JSON Lines format"); } - std::for_each(sources.begin(), sources.end(), [](auto const& source) { - CUDF_EXPECTS(source->size() < std::numeric_limits::max(), - "The size of each source file must be less than INT_MAX bytes"); - }); - - constexpr size_t batch_size_ub = std::numeric_limits::max(); - size_t const chunk_offset = reader_opts.get_byte_range_offset(); + /* + * The batched JSON reader enforces that the size of each batch is at most INT_MAX + * bytes (~2.14GB). 
Batches are defined to be byte range chunks - characterized by + * chunk offset and chunk size - that may span across multiple source files. + * Note that the batched reader does not work for compressed inputs or for regular + * JSON inputs. + */ + size_t const total_source_size = sources_size(sources, 0, 0); + size_t chunk_offset = reader_opts.get_byte_range_offset(); size_t chunk_size = reader_opts.get_byte_range_size(); - chunk_size = !chunk_size ? sources_size(sources, 0, 0) : chunk_size; - - // Identify the position of starting source file from which to begin batching based on - // byte range offset. If the offset is larger than the sum of all source - // sizes, then start_source is total number of source files i.e. no file is read - size_t const start_source = [&]() { - size_t sum = 0; + chunk_size = !chunk_size ? total_source_size - chunk_offset + : std::min(chunk_size, total_source_size - chunk_offset); + + size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size); + size_t const batch_size_ub = + std::numeric_limits::max() - (max_subchunks_prealloced * size_per_subchunk); + + /* + * Identify the position (zero-indexed) of starting source file from which to begin + * batching based on byte range offset. If the offset is larger than the sum of all + * source sizes, then start_source is total number of source files i.e. no file is + * read + */ + + // Prefix sum of source file sizes + size_t pref_source_size = 0; + // Starting source file from which to being batching evaluated using byte range offset + size_t const start_source = [chunk_offset, &sources, &pref_source_size]() { for (size_t src_idx = 0; src_idx < sources.size(); ++src_idx) { - if (sum + sources[src_idx]->size() > chunk_offset) return src_idx; - sum += sources[src_idx]->size(); + if (pref_source_size + sources[src_idx]->size() > chunk_offset) { return src_idx; } + pref_source_size += sources[src_idx]->size(); } return sources.size(); }(); - - // Construct batches of source files, with starting position of batches indicated by - // batch_positions. The size of each batch i.e. the sum of sizes of the source files in the batch - // is capped at INT_MAX bytes. - size_t cur_size = 0; - std::vector batch_positions; - std::vector batch_sizes; - batch_positions.push_back(0); - for (size_t i = start_source; i < sources.size(); i++) { - cur_size += sources[i]->size(); - if (cur_size >= batch_size_ub) { - batch_positions.push_back(i); - batch_sizes.push_back(cur_size - sources[i]->size()); - cur_size = sources[i]->size(); + /* + * Construct batches of byte ranges spanning source files, with the starting position of batches + * indicated by `batch_offsets`. `pref_bytes_size` gives the bytes position from which the current + * batch begins, and `end_bytes_size` gives the terminal bytes position after which reading + * stops. 
+ */ + size_t pref_bytes_size = chunk_offset; + size_t end_bytes_size = chunk_offset + chunk_size; + std::vector batch_offsets{pref_bytes_size}; + for (size_t i = start_source; i < sources.size() && pref_bytes_size < end_bytes_size;) { + pref_source_size += sources[i]->size(); + // If the current source file can subsume multiple batches, we split the file until the + // boundary of the last batch exceeds the end of the file (indexed by `pref_source_size`) + while (pref_bytes_size < end_bytes_size && + pref_source_size >= std::min(pref_bytes_size + batch_size_ub, end_bytes_size)) { + auto next_batch_size = std::min(batch_size_ub, end_bytes_size - pref_bytes_size); + batch_offsets.push_back(batch_offsets.back() + next_batch_size); + pref_bytes_size += next_batch_size; } + i++; } - batch_positions.push_back(sources.size()); - batch_sizes.push_back(cur_size); - - // If there is a single batch, then we can directly return the table without the - // unnecessary concatenate - if (batch_sizes.size() == 1) return read_batch(sources, reader_opts, stream, mr); + /* + * If there is a single batch, then we can directly return the table without the + * unnecessary concatenate. The size of batch_offsets is 1 if all sources are empty, + * or if end_bytes_size is larger than total_source_size. + */ + if (batch_offsets.size() <= 2) return read_batch(sources, reader_opts, stream, mr); std::vector partial_tables; json_reader_options batched_reader_opts{reader_opts}; - // Dispatch individual batches to read_batch and push the resulting table into // partial_tables array. Note that the reader options need to be updated for each // batch to adjust byte range offset and byte range size. - for (size_t i = 0; i < batch_sizes.size(); i++) { - batched_reader_opts.set_byte_range_size(std::min(batch_sizes[i], chunk_size)); - partial_tables.emplace_back(read_batch( - host_span>(sources.begin() + batch_positions[i], - batch_positions[i + 1] - batch_positions[i]), - batched_reader_opts, - stream, - rmm::mr::get_current_device_resource())); - if (chunk_size <= batch_sizes[i]) break; - chunk_size -= batch_sizes[i]; - batched_reader_opts.set_byte_range_offset(0); + for (size_t i = 0; i < batch_offsets.size() - 1; i++) { + batched_reader_opts.set_byte_range_offset(batch_offsets[i]); + batched_reader_opts.set_byte_range_size(batch_offsets[i + 1] - batch_offsets[i]); + partial_tables.emplace_back( + read_batch(sources, batched_reader_opts, stream, rmm::mr::get_current_device_resource())); } auto expects_schema_equality = diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp index 0c30b4cad46..ff69f9b7627 100644 --- a/cpp/src/io/json/read_json.hpp +++ b/cpp/src/io/json/read_json.hpp @@ -29,6 +29,19 @@ namespace cudf::io::json::detail { +// Some magic numbers +constexpr int num_subchunks = 10; // per chunk_size +constexpr size_t min_subchunk_size = 10000; +constexpr int estimated_compression_ratio = 4; +constexpr int max_subchunks_prealloced = 3; + +device_span ingest_raw_input(device_span buffer, + host_span> sources, + compression_type compression, + size_t range_offset, + size_t range_size, + rmm::cuda_stream_view stream); + table_with_metadata read_json(host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream, @@ -38,9 +51,4 @@ size_type find_first_delimiter(device_span d_data, char const delimiter, rmm::cuda_stream_view stream); -size_type find_first_delimiter_in_chunk(host_span> sources, - json_reader_options const& reader_opts, - char const delimiter, - 
rmm::cuda_stream_view stream); - } // namespace cudf::io::json::detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 8e2017ccb97..05e9759632f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -313,17 +313,17 @@ ConfigureTest( PERCENT 30 ) ConfigureTest( - JSON_TEST io/json_test.cpp io/json_chunked_reader.cpp + JSON_TEST io/json/json_test.cpp io/json/json_chunked_reader.cu GPUS 1 PERCENT 30 ) -ConfigureTest(JSON_WRITER_TEST io/json_writer.cpp) -ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu) -ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp) +ConfigureTest(JSON_WRITER_TEST io/json/json_writer.cpp) +ConfigureTest(JSON_TYPE_CAST_TEST io/json/json_type_cast_test.cu) +ConfigureTest(NESTED_JSON_TEST io/json/nested_json_test.cpp io/json/json_tree.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) -ConfigureTest(JSON_QUOTE_NORMALIZATION io/json_quote_normalization_test.cpp) -ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json_whitespace_normalization_test.cu) +ConfigureTest(JSON_QUOTE_NORMALIZATION io/json/json_quote_normalization_test.cpp) +ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json/json_whitespace_normalization_test.cu) ConfigureTest( DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp GPUS 1 @@ -572,7 +572,7 @@ ConfigureTest( LARGE_STRINGS_TEST large_strings/concatenate_tests.cpp large_strings/case_tests.cpp - large_strings/json_tests.cpp + large_strings/json_tests.cu large_strings/large_strings_fixture.cpp large_strings/merge_tests.cpp large_strings/parquet_tests.cpp diff --git a/cpp/tests/io/json_chunked_reader.cpp b/cpp/tests/io/json/json_chunked_reader.cu similarity index 64% rename from cpp/tests/io/json_chunked_reader.cpp rename to cpp/tests/io/json/json_chunked_reader.cu index 23d54f7263c..b9dee54752c 100644 --- a/cpp/tests/io/json_chunked_reader.cpp +++ b/cpp/tests/io/json/json_chunked_reader.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "io/json/read_json.hpp" +#include "json_utils.cuh" #include #include @@ -37,65 +37,6 @@ cudf::test::TempDirTestEnvironment* const temp_env = static_cast( ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); -// function to extract first delimiter in the string in each chunk, -// collate together and form byte_range for each chunk, -// parse separately. -std::vector skeleton_for_parellel_chunk_reader( - cudf::host_span> sources, - cudf::io::json_reader_options const& reader_opts, - int32_t chunk_size, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - using namespace cudf::io::json::detail; - using cudf::size_type; - size_t total_source_size = 0; - for (auto const& source : sources) { - total_source_size += source->size(); - } - size_t num_chunks = (total_source_size + chunk_size - 1) / chunk_size; - constexpr size_type no_min_value = -1; - - // Get the first delimiter in each chunk. 
- std::vector first_delimiter_index(num_chunks); - auto reader_opts_chunk = reader_opts; - for (size_t i = 0; i < num_chunks; i++) { - auto const chunk_start = i * chunk_size; - reader_opts_chunk.set_byte_range_offset(chunk_start); - reader_opts_chunk.set_byte_range_size(chunk_size); - first_delimiter_index[i] = - find_first_delimiter_in_chunk(sources, reader_opts_chunk, '\n', stream); - if (first_delimiter_index[i] != no_min_value) { first_delimiter_index[i] += chunk_start; } - } - - // Process and allocate record start, end for each worker. - using record_range = std::pair; - std::vector record_ranges; - record_ranges.reserve(num_chunks); - first_delimiter_index[0] = 0; - auto prev = first_delimiter_index[0]; - for (size_t i = 1; i < num_chunks; i++) { - if (first_delimiter_index[i] == no_min_value) continue; - record_ranges.emplace_back(prev, first_delimiter_index[i]); - prev = first_delimiter_index[i]; - } - record_ranges.emplace_back(prev, total_source_size); - - std::vector tables; - // Process each chunk in parallel. - for (auto const& [chunk_start, chunk_end] : record_ranges) { - if (chunk_start == -1 or chunk_end == -1 or - static_cast(chunk_start) >= total_source_size) - continue; - reader_opts_chunk.set_byte_range_offset(chunk_start); - reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start); - tables.push_back(read_json(sources, reader_opts_chunk, stream, mr)); - } - // assume all records have same number of columns, and inferred same type. (or schema is passed) - // TODO a step before to merge all columns, types and infer final schema. - return tables; -} - TEST_F(JsonReaderTest, ByteRange_SingleSource) { std::string const json_string = R"( @@ -118,11 +59,11 @@ TEST_F(JsonReaderTest, ByteRange_SingleSource) // Test for different chunk sizes for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500}) { - auto const tables = skeleton_for_parellel_chunk_reader(datasources, - json_lines_options, - chunk_size, - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); + auto const tables = split_byte_range_reading(datasources, + json_lines_options, + chunk_size, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); auto table_views = std::vector(tables.size()); std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) { @@ -213,11 +154,11 @@ TEST_F(JsonReaderTest, ByteRange_MultiSource) // Test for different chunk sizes for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500, 1000, 2000}) { - auto const tables = skeleton_for_parellel_chunk_reader(datasources, - json_lines_options, - chunk_size, - cudf::get_default_stream(), - rmm::mr::get_current_device_resource()); + auto const tables = split_byte_range_reading(datasources, + json_lines_options, + chunk_size, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); auto table_views = std::vector(tables.size()); std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) { diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp similarity index 100% rename from cpp/tests/io/json_quote_normalization_test.cpp rename to cpp/tests/io/json/json_quote_normalization_test.cpp diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json/json_test.cpp similarity index 100% rename from cpp/tests/io/json_test.cpp rename to cpp/tests/io/json/json_test.cpp diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp similarity index 100% rename from 
cpp/tests/io/json_tree.cpp rename to cpp/tests/io/json/json_tree.cpp diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json/json_type_cast_test.cu similarity index 100% rename from cpp/tests/io/json_type_cast_test.cu rename to cpp/tests/io/json/json_type_cast_test.cu diff --git a/cpp/tests/io/json/json_utils.cuh b/cpp/tests/io/json/json_utils.cuh new file mode 100644 index 00000000000..9383797d91b --- /dev/null +++ b/cpp/tests/io/json/json_utils.cuh @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "io/json/read_json.hpp" + +#include +#include +#include +#include + +#include + +#include + +// Helper function to test correctness of JSON byte range reading. +// We split the input source files into a set of byte range chunks each of size +// `chunk_size` and return an array of partial tables constructed from each chunk +template +std::vector split_byte_range_reading( + cudf::host_span> sources, + cudf::io::json_reader_options const& reader_opts, + IndexType chunk_size, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto total_source_size = [&sources]() { + return std::accumulate(sources.begin(), sources.end(), 0ul, [=](size_t sum, auto& source) { + auto const size = source->size(); + return sum + size; + }); + }(); + auto find_first_delimiter_in_chunk = + [total_source_size, &sources, &stream]( + cudf::io::json_reader_options const& reader_opts) -> IndexType { + rmm::device_uvector buffer(total_source_size, stream); + auto readbufspan = cudf::io::json::detail::ingest_raw_input(buffer, + sources, + reader_opts.get_compression(), + reader_opts.get_byte_range_offset(), + reader_opts.get_byte_range_size(), + stream); + // Note: we cannot reuse cudf::io::json::detail::find_first_delimiter since the + // return type of that function is size_type. However, when the chunk_size is + // larger than INT_MAX, the position of the delimiter can also be larger than + // INT_MAX. We do not encounter this overflow error in the detail function + // since the batched JSON reader splits the byte_range_size into chunk_sizes + // smaller than INT_MAX bytes + auto const first_delimiter_position_it = + thrust::find(rmm::exec_policy(stream), readbufspan.begin(), readbufspan.end(), '\n'); + return first_delimiter_position_it != readbufspan.end() + ? thrust::distance(readbufspan.begin(), first_delimiter_position_it) + : -1; + }; + size_t num_chunks = (total_source_size + chunk_size - 1) / chunk_size; + constexpr IndexType no_min_value = -1; + + // Get the first delimiter in each chunk. 
+ std::vector first_delimiter_index(num_chunks); + auto reader_opts_chunk = reader_opts; + for (size_t i = 0; i < num_chunks; i++) { + auto const chunk_start = i * chunk_size; + // We are updating reader_opt_chunks to store offset and size information for the current chunk + reader_opts_chunk.set_byte_range_offset(chunk_start); + reader_opts_chunk.set_byte_range_size(chunk_size); + first_delimiter_index[i] = find_first_delimiter_in_chunk(reader_opts_chunk); + } + + // Process and allocate record start, end for each worker. + using record_range = std::pair; + std::vector record_ranges; + record_ranges.reserve(num_chunks); + size_t prev = 0; + for (size_t i = 1; i < num_chunks; i++) { + // In the case where chunk_size is smaller than row size, the chunk needs to be skipped + if (first_delimiter_index[i] == no_min_value) continue; + size_t next = static_cast(first_delimiter_index[i]) + (i * chunk_size); + record_ranges.emplace_back(prev, next); + prev = next; + } + record_ranges.emplace_back(prev, total_source_size); + + std::vector tables; + for (auto const& [chunk_start, chunk_end] : record_ranges) { + reader_opts_chunk.set_byte_range_offset(chunk_start); + reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start); + tables.push_back(cudf::io::json::detail::read_json(sources, reader_opts_chunk, stream, mr)); + } + // assume all records have same number of columns, and inferred same type. (or schema is passed) + // TODO a step before to merge all columns, types and infer final schema. + return tables; +} diff --git a/cpp/tests/io/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu similarity index 100% rename from cpp/tests/io/json_whitespace_normalization_test.cu rename to cpp/tests/io/json/json_whitespace_normalization_test.cu diff --git a/cpp/tests/io/json_writer.cpp b/cpp/tests/io/json/json_writer.cpp similarity index 100% rename from cpp/tests/io/json_writer.cpp rename to cpp/tests/io/json/json_writer.cpp diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp similarity index 100% rename from cpp/tests/io/nested_json_test.cpp rename to cpp/tests/io/json/nested_json_test.cpp diff --git a/cpp/tests/large_strings/json_tests.cpp b/cpp/tests/large_strings/json_tests.cu similarity index 50% rename from cpp/tests/large_strings/json_tests.cpp rename to cpp/tests/large_strings/json_tests.cu index bf16d131ba7..49abf7b484d 100644 --- a/cpp/tests/large_strings/json_tests.cpp +++ b/cpp/tests/large_strings/json_tests.cu @@ -14,8 +14,13 @@ * limitations under the License. 
*/ +#include "../io/json/json_utils.cuh" #include "large_strings_fixture.hpp" +#include + +#include +#include #include #include @@ -28,31 +33,57 @@ TEST_F(JsonLargeReaderTest, MultiBatch) { "a": { "y" : 6}, "b" : [4, 5 ], "c": 12 } { "a": { "y" : 6}, "b" : [6 ], "c": 13 } { "a": { "y" : 6}, "b" : [7 ], "c": 14 })"; - constexpr size_t expected_file_size = std::numeric_limits::max() / 2; + constexpr size_t batch_size_ub = std::numeric_limits::max(); + constexpr size_t expected_file_size = 1.5 * static_cast(batch_size_ub); std::size_t const log_repetitions = static_cast(std::ceil(std::log2(expected_file_size / json_string.size()))); json_string.reserve(json_string.size() * (1UL << log_repetitions)); - std::size_t numrows = 4; for (std::size_t i = 0; i < log_repetitions; i++) { json_string += json_string; - numrows <<= 1; } constexpr int num_sources = 2; - std::vector> hostbufs( - num_sources, cudf::host_span(json_string.data(), json_string.size())); + std::vector> hostbufs( + num_sources, + cudf::host_span(reinterpret_cast(json_string.data()), + json_string.size())); // Initialize parsing options (reading json lines) cudf::io::json_reader_options json_lines_options = cudf::io::json_reader_options::builder( cudf::io::source_info{ - cudf::host_span>(hostbufs.data(), hostbufs.size())}) + cudf::host_span>(hostbufs.data(), hostbufs.size())}) .lines(true) .compression(cudf::io::compression_type::NONE) .recovery_mode(cudf::io::json_recovery_mode_t::FAIL); // Read full test data via existing, nested JSON lines reader cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options); - ASSERT_EQ(current_reader_table.tbl->num_rows(), numrows * num_sources); + + std::vector> datasources; + for (auto& hb : hostbufs) { + datasources.emplace_back(cudf::io::datasource::create(hb)); + } + // Test for different chunk sizes + std::vector chunk_sizes{ + batch_size_ub / 4, batch_size_ub / 2, batch_size_ub, static_cast(batch_size_ub * 2)}; + for (auto chunk_size : chunk_sizes) { + auto const tables = + split_byte_range_reading(datasources, + json_lines_options, + chunk_size, + cudf::get_default_stream(), + rmm::mr::get_current_device_resource()); + + auto table_views = std::vector(tables.size()); + std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) { + return table.tbl->view(); + }); + auto result = cudf::concatenate(table_views); + + // Verify that the data read via chunked reader matches the data read via nested JSON reader + // cannot use EQUAL due to concatenate removing null mask + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(current_reader_table.tbl->view(), result->view()); + } } From c7b28ceeb46d2b921e30f081a9ed97745c91ff9e Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 23 Jul 2024 05:28:13 -0500 Subject: [PATCH 02/15] Add `drop_nulls` in `cudf-polars` (#16290) Closes https://github.com/rapidsai/cudf/issues/16219 Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16290 --- python/cudf_polars/cudf_polars/dsl/expr.py | 30 +++++++++- python/cudf_polars/tests/test_drop_nulls.py | 65 +++++++++++++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 python/cudf_polars/tests/test_drop_nulls.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index a034d55120a..8322d6bd6fb 100644 --- 
a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -882,7 +882,14 @@ def __init__( self.name = name self.options = options self.children = children - if self.name not in ("mask_nans", "round", "setsorted", "unique"): + if self.name not in ( + "mask_nans", + "round", + "setsorted", + "unique", + "dropnull", + "fill_null", + ): raise NotImplementedError(f"Unary function {name=}") def do_evaluate( @@ -968,6 +975,27 @@ def do_evaluate( order=order, null_order=null_order, ) + elif self.name == "dropnull": + (column,) = ( + child.evaluate(df, context=context, mapping=mapping) + for child in self.children + ) + return Column( + plc.stream_compaction.drop_nulls( + plc.Table([column.obj]), [0], 1 + ).columns()[0] + ) + elif self.name == "fill_null": + column = self.children[0].evaluate(df, context=context, mapping=mapping) + if isinstance(self.children[1], Literal): + arg = plc.interop.from_arrow(self.children[1].value) + else: + evaluated = self.children[1].evaluate( + df, context=context, mapping=mapping + ) + arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj + return Column(plc.replace.replace_nulls(column.obj, arg)) + raise NotImplementedError( f"Unimplemented unary function {self.name=}" ) # pragma: no cover; init trips first diff --git a/python/cudf_polars/tests/test_drop_nulls.py b/python/cudf_polars/tests/test_drop_nulls.py new file mode 100644 index 00000000000..5dfe9f66a97 --- /dev/null +++ b/python/cudf_polars/tests/test_drop_nulls.py @@ -0,0 +1,65 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) + + +@pytest.fixture( + params=[ + [1, 2, 1, 3, 5, None, None], + [1.5, 2.5, None, 1.5, 3, float("nan"), 3], + [], + [None, None], + [1, 2, 3, 4, 5], + ] +) +def null_data(request): + is_empty = pl.Series(request.param).dtype == pl.Null + return pl.DataFrame( + { + "a": pl.Series(request.param, dtype=pl.Float64 if is_empty else None), + "b": pl.Series(request.param, dtype=pl.Float64 if is_empty else None), + } + ).lazy() + + +def test_drop_null(null_data): + q = null_data.select(pl.col("a").drop_nulls()) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "value", + [0, pl.col("a").mean(), pl.col("b")], + ids=["scalar", "aggregation", "column_expression"], +) +def test_fill_null(null_data, value): + q = null_data.select(pl.col("a").fill_null(value)) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize( + "strategy", ["forward", "backward", "min", "max", "mean", "zero", "one"] +) +def test_fill_null_with_strategy(null_data, strategy): + q = null_data.select(pl.col("a").fill_null(strategy=strategy)) + + # Not yet exposed to python from rust + assert_ir_translation_raises(q, NotImplementedError) + + +@pytest.mark.parametrize("strategy", ["forward", "backward"]) +@pytest.mark.parametrize("limit", [0, 1, 2]) +def test_fill_null_with_limit(null_data, strategy, limit): + q = null_data.select(pl.col("a").fill_null(strategy=strategy, limit=limit)) + + # Not yet exposed to python from rust + assert_ir_translation_raises(q, NotImplementedError) From e6d412cba7c23df7ee500c28257ed9281cea49b9 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 23 Jul 2024 06:03:28 -0500 Subject: [PATCH 03/15] Fall back when casting a 
timestamp to numeric in cudf-polars (#16232) This PR adds logic that falls back to CPU when a cudf-polars query would cast a timestamp column to a numeric type, an unsupported operation in libcudf, which should fix a few polars tests. It could be cleaned up a bit with some of the utilities that will be added in https://github.com/rapidsai/cudf/pull/16150. Authors: - https://github.com/brandon-b-miller Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16232 --- python/cudf_polars/cudf_polars/dsl/expr.py | 4 ++ .../tests/expressions/test_casting.py | 52 +++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 python/cudf_polars/tests/expressions/test_casting.py diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 8322d6bd6fb..9835e6f8461 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -1188,6 +1188,10 @@ class Cast(Expr): def __init__(self, dtype: plc.DataType, value: Expr) -> None: super().__init__(dtype) self.children = (value,) + if not plc.unary.is_supported_cast(self.dtype, value.dtype): + raise NotImplementedError( + f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}" + ) def do_evaluate( self, diff --git a/python/cudf_polars/tests/expressions/test_casting.py b/python/cudf_polars/tests/expressions/test_casting.py new file mode 100644 index 00000000000..3e003054338 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_casting.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) + +_supported_dtypes = [(pl.Int8(), pl.Int64())] + +_unsupported_dtypes = [ + (pl.String(), pl.Int64()), +] + + +@pytest.fixture +def dtypes(request): + return request.param + + +@pytest.fixture +def tests(dtypes): + fromtype, totype = dtypes + if fromtype == pl.String(): + data = ["a", "b", "c"] + else: + data = [1, 2, 3] + return pl.DataFrame( + { + "a": pl.Series(data, dtype=fromtype), + } + ).lazy(), totype + + +@pytest.mark.parametrize("dtypes", _supported_dtypes, indirect=True) +def test_cast_supported(tests): + df, totype = tests + q = df.select(pl.col("a").cast(totype)) + assert_gpu_result_equal(q) + + +@pytest.mark.parametrize("dtypes", _unsupported_dtypes, indirect=True) +def test_cast_unsupported(tests): + df, totype = tests + assert_ir_translation_raises( + df.select(pl.col("a").cast(totype)), NotImplementedError + ) From ff30c0211109e14b1f6918fcc6c2e2b98f863a1f Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Tue, 23 Jul 2024 12:03:55 -0700 Subject: [PATCH 04/15] Fix compile warnings with `jni_utils.hpp` (#16336) This fixes the compiler warnings with `jni_utils.hpp`, removing some `const` qualifiers that are redundant. Closes https://github.com/rapidsai/cudf/issues/16335. 
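For background (an illustration, not code taken from this PR): a cv-qualifier at the top level of a by-value return type has no effect on the caller, so compilers flag it (e.g. GCC's `-Wignored-qualifiers`). That is why signatures such as `const jstring get_jstring() const` and `const N_TYPE* const data() const` in the diff below lose their redundant `const`. A minimal repro of the warning class (compile with `-Wextra`), with made-up names:

```cpp
struct holder {
  int value{};
  int const* ptr{};

  const int get_value() const { return value; }      // warning: type qualifiers ignored on return type
  int get_value_fixed() const { return value; }      // equivalent behaviour, no warning

  const int* const get_ptr() const { return ptr; }   // warning: top-level const on the returned pointer is ignored
  int const* get_ptr_fixed() const { return ptr; }   // pointee stays const; warning gone
};
```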
Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/16336 --- java/src/main/native/include/jni_utils.hpp | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp index ea04c1cda83..a3b4bfcb63e 100644 --- a/java/src/main/native/include/jni_utils.hpp +++ b/java/src/main/native/include/jni_utils.hpp @@ -284,7 +284,7 @@ class native_jArray { return data()[index]; } - const N_TYPE* const data() const + N_TYPE const* data() const { init_data_ptr(); return data_ptr; @@ -296,17 +296,15 @@ class native_jArray { return data_ptr; } - const N_TYPE* const begin() const { return data(); } + N_TYPE const* begin() const { return data(); } N_TYPE* begin() { return data(); } - const N_TYPE* const end() const { return data() + size(); } + N_TYPE const* end() const { return data() + size(); } N_TYPE* end() { return data() + size(); } - const J_ARRAY_TYPE get_jArray() const { return orig; } - - J_ARRAY_TYPE get_jArray() { return orig; } + J_ARRAY_TYPE get_jArray() const { return orig; } /** * @brief Conversion to std::vector @@ -430,9 +428,7 @@ class native_jpointerArray { T* const* begin() const { return data(); } T* const* end() const { return data() + size(); } - const jlongArray get_jArray() const { return wrapped.get_jArray(); } - - jlongArray get_jArray() { return wrapped.get_jArray(); } + jlongArray get_jArray() const { return wrapped.get_jArray(); } void assert_no_nulls() const { @@ -624,7 +620,7 @@ class native_jstring { return true; } - const jstring get_jstring() const { return orig; } + jstring get_jstring() const { return orig; } ~native_jstring() { @@ -753,13 +749,13 @@ class native_jstringArray { return cache[index]; } - char const** const as_c_array() const + char const** as_c_array() const { init_c_cache(); return c_cache.data(); } - const std::vector as_cpp_vector() const + std::vector as_cpp_vector() const { init_cpp_cache(); return cpp_cache; From cd711913d2312ba158e34f5c03784a7b07f1583a Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Wed, 24 Jul 2024 00:24:19 +0200 Subject: [PATCH 05/15] Adds write-coalescing code path optimization to FST (#16143) This PR adds an optimized code path to the finite-state transducer (FST) that will use a shared memory-backed write buffer for the translated output and translated output indexes, if the the write buffer does not require allocating excessive amounts of shared memory (i.e., current heuristic is 24 KB/CTA). Writes are first buffered in shared memory and then collaboratively written out using coalesced writes to global memory. ## Benchmark results Numbers are for libcudf's FST_NVBENCH for a 1.073 GB input. FST outputs one token per input symbol. Benchmarks run on V100 with 900 GB/s theoretical peak BW. We compare the current FST implementation (old) to an FST implementaation that uses write-coalescing to gmem (new). 
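For intuition, here is a minimal, self-contained sketch of the staging-then-flush pattern described above. It is illustrative only — a toy "drop all spaces" transform over a single tile, not the kernel added by this PR — and every name in it is made up:

```cuda
#include <cub/block/block_scan.cuh>

// Each thread emits a variable number of output bytes, stages them in a shared
// memory tile buffer at an offset obtained from a block-wide exclusive scan,
// and the whole block then flushes the packed buffer with coalesced stores.
template <int BLOCK_THREADS, int ITEMS_PER_THREAD>
__global__ void compact_tile(char const* d_in, int num_items, char* d_out, int* d_out_count)
{
  using BlockScan = cub::BlockScan<int, BLOCK_THREADS>;
  __shared__ typename BlockScan::TempStorage scan_storage;
  __shared__ char tile_buffer[BLOCK_THREADS * ITEMS_PER_THREAD];
  __shared__ int tile_count;

  // 1) Gather this thread's outputs into registers and count them
  char thread_out[ITEMS_PER_THREAD];
  int thread_count = 0;
  for (int i = 0; i < ITEMS_PER_THREAD; ++i) {
    int const idx = threadIdx.x * ITEMS_PER_THREAD + i;
    if (idx < num_items && d_in[idx] != ' ') { thread_out[thread_count++] = d_in[idx]; }
  }

  // 2) Exclusive scan gives each thread its write offset within the tile buffer
  int thread_offset = 0;
  int block_total   = 0;
  BlockScan(scan_storage).ExclusiveSum(thread_count, thread_offset, block_total);
  if (threadIdx.x == 0) { tile_count = block_total; }

  // 3) Scatter thread-local outputs into shared memory (uncoalesced, but cheap)
  for (int i = 0; i < thread_count; ++i) { tile_buffer[thread_offset + i] = thread_out[i]; }
  __syncthreads();

  // 4) Flush the densely packed buffer to global memory with block-strided,
  //    coalesced stores -- the write pattern the new code path targets
  for (int i = threadIdx.x; i < tile_count; i += BLOCK_THREADS) {
    d_out[i] = tile_buffer[i];
  }
  if (threadIdx.x == 0) { *d_out_count = tile_count; }
}

// Example launch for a single tile of 128 * 16 input bytes:
//   compact_tile<128, 16><<<1, 128>>>(d_in, n, d_out, d_out_count);
```

With that picture in mind, the measured throughput comparison is: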
| | OLD throughput (GB/s) | NEW throughput (GB/s) | relative performance | | 1st kernel, per byte: bytes read/written | 2nd kernel, per byte: bytes read/written | expected SOL (GB/s) | achieved SOL (old) | achieved SOL (new) | |------------------|------------------------|------------------------|----------------------|---|------------------------------------------|------------------------------------------|---------------------|--------------------|--------------------| | full | 15.7 | 74.74 | 476% | | 1 | 6 | 102.86 | 15.26% | 72.66% | | no out-indexes | 39.123 | 105.8 | 270% | | 1 | 2 | 240.00 | 16.30% | 44.08% | | no-output | 229.27 | 178.92 | 78% | | 1 | 1 | 360.00 | 63.69% | 49.70% | | out-indexes-only | 24.95 | 85.2 | 341% | | 1 | 5 | 120.00 | 20.79% | 71.00% | Authors: - Elias Stehle (https://github.com/elstehle) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/16143 --- cpp/benchmarks/io/fst.cu | 16 +- cpp/src/io/fst/agent_dfa.cuh | 371 ++++++++++++++++++++++---- cpp/src/io/fst/dispatch_dfa.cuh | 7 +- cpp/src/io/fst/lookup_tables.cuh | 70 +++-- cpp/src/io/json/json_normalization.cu | 26 +- cpp/src/io/json/nested_json_gpu.cu | 25 +- cpp/tests/io/fst/common.hpp | 4 +- cpp/tests/io/fst/fst_test.cu | 4 +- 8 files changed, 425 insertions(+), 98 deletions(-) diff --git a/cpp/benchmarks/io/fst.cu b/cpp/benchmarks/io/fst.cu index ad19bdfdfcb..31f1bf8e70f 100644 --- a/cpp/benchmarks/io/fst.cu +++ b/cpp/benchmarks/io/fst.cu @@ -95,7 +95,9 @@ void BM_FST_JSON(nvbench::state& state) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); @@ -134,7 +136,9 @@ void BM_FST_JSON_no_outidx(nvbench::state& state) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); @@ -171,7 +175,9 @@ void BM_FST_JSON_no_out(nvbench::state& state) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); @@ -209,7 +215,9 @@ void BM_FST_JSON_no_str(nvbench::state& state) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 2171764decd..bc5b94e2718 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -18,7 +18,9 @@ #include "in_reg_array.cuh" #include +#include #include 
+#include #include namespace cudf::io::fst::detail { @@ -44,9 +46,10 @@ using StateIndexT = uint32_t; template struct VectorCompositeOp { template - __host__ __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs) + __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs) { VectorT res{}; +#pragma unroll for (int32_t i = 0; i < NUM_ITEMS; ++i) { res.Set(i, rhs.Get(lhs.Get(i))); } @@ -57,61 +60,275 @@ struct VectorCompositeOp { /** * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed - * to emit any output symbol (the "transduced" output) and, if so, keeps track of how many symbols - * it intends to write out and writing out such symbols to the given output iterators. + * to emit any output symbol (the "transduced" output) and, if so, keeps track of *how many* symbols + * it intends to write out. + */ +template +class DFACountCallbackWrapper { + public: + __device__ __forceinline__ DFACountCallbackWrapper(TransducerTableT transducer_table) + : transducer_table(transducer_table) + { + } + + template + __device__ __forceinline__ void Init(OffsetT const&) + { + out_count = 0; + } + + template + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol) + { + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + out_count += count; + } + + __device__ __forceinline__ void TearDown() {} + TransducerTableT const transducer_table; + uint32_t out_count{}; +}; + +/** + * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the + * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed + * to emit any output symbol (the "transduced" output) and, if so, writes out such symbols to the + * given output iterators. * + * @tparam MaxTranslatedOutChars The maximum number of symbols that are written on a any given state + * transition * @tparam TransducerTableT The type implementing a transducer table that can be used for looking up * the symbols that are supposed to be emitted on a given state transition. - * @tparam TransducedOutItT A Random-access output iterator type to which symbols returned by the + * @tparam TransducedOutItT A random-access output iterator type to which symbols returned by the * transducer table are assignable. - * @tparam TransducedIndexOutItT A Random-access output iterator type to which indexes are written. + * @tparam TransducedIndexOutItT A random-access output iterator type to which indexes are written. 
*/ -template -class DFASimulationCallbackWrapper { +template +class DFAWriteCallbackWrapper { public: - __host__ __device__ __forceinline__ DFASimulationCallbackWrapper( - TransducerTableT transducer_table, TransducedOutItT out_it, TransducedIndexOutItT out_idx_it) - : transducer_table(transducer_table), out_it(out_it), out_idx_it(out_idx_it), write(false) + __device__ __forceinline__ DFAWriteCallbackWrapper(TransducerTableT transducer_table, + TransducedOutItT out_it, + TransducedIndexOutItT out_idx_it, + uint32_t out_offset, + uint32_t /*tile_out_offset*/, + uint32_t /*tile_in_offset*/, + uint32_t /*tile_out_count*/) + : transducer_table(transducer_table), + out_it(out_it), + out_idx_it(out_idx_it), + out_offset(out_offset) { } template - __host__ __device__ __forceinline__ void Init(OffsetT const& offset) + __device__ __forceinline__ void Init(OffsetT const& in_offset) + { + this->in_offset = in_offset; + } + + template + __device__ __forceinline__ + typename ::cuda::std::enable_if<(MaxTranslatedOutChars_ <= 2), void>::type + ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol, + cub::Int2Type /*MaxTranslatedOutChars*/) + { + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + +#pragma unroll + for (uint32_t out_char = 0; out_char < MaxTranslatedOutChars_; out_char++) { + if (out_char < count) { + out_it[out_offset + out_char] = + transducer_table(old_state, symbol_id, out_char, read_symbol); + out_idx_it[out_offset + out_char] = in_offset + character_index; + } + } + out_offset += count; + } + + template + __device__ __forceinline__ + typename ::cuda::std::enable_if<(MaxTranslatedOutChars_ > 2), void>::type + ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol, + cub::Int2Type) { - this->offset = offset; - if (!write) out_count = 0; + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); + + for (uint32_t out_char = 0; out_char < count; out_char++) { + out_it[out_offset + out_char] = transducer_table(old_state, symbol_id, out_char, read_symbol); + out_idx_it[out_offset + out_char] = in_offset + character_index; + } + out_offset += count; } template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, - StateIndexT const old_state, - StateIndexT const new_state, - SymbolIndexT const symbol_id, - SymbolT const read_symbol) + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol) + { + ReadSymbol(character_index, + old_state, + new_state, + symbol_id, + read_symbol, + cub::Int2Type{}); + } + + __device__ __forceinline__ void TearDown() {} + + public: + TransducerTableT const transducer_table; + TransducedOutItT out_it; + TransducedIndexOutItT out_idx_it; + uint32_t out_offset; + uint32_t in_offset; +}; + +/** + * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the + * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed + * to emit any output symbol (the "transduced" output) and, if so, writes out such symbols to the + * given output iterators. This class uses a shared memory-backed write buffer to coalesce writes to + * global memory. 
+ * + * @tparam DiscardIndexOutput Whether to discard the indexes instead of writing them to the given + * output iterator + * @tparam DiscardTranslatedOutput Whether to discard the translated output symbols instead of + * writing them to the given output iterator + * @tparam NumWriteBufferItems The number of items to allocate in shared memory for the write + * buffer. + * @tparam OutputT The type of the translated items + * @tparam TransducerTableT The type implementing a transducer table that can be used for looking up + * the symbols that are supposed to be emitted on a given state transition. + * @tparam TransducedOutItT A random-access output iterator type to which symbols returned by the + * transducer table are assignable. + * @tparam TransducedIndexOutItT A random-access output iterator type to which indexes are written. + */ +template +class WriteCoalescingCallbackWrapper { + struct TempStorage_Offsets { + uint16_t compacted_offset[NumWriteBufferItems]; + }; + struct TempStorage_Symbols { + OutputT compacted_symbols[NumWriteBufferItems]; + }; + using offset_cache_t = + ::cuda::std::conditional_t; + using symbol_cache_t = ::cuda::std:: + conditional_t, TempStorage_Symbols>; + struct TempStorage_ : offset_cache_t, symbol_cache_t {}; + + __device__ __forceinline__ TempStorage_& PrivateStorage() + { + __shared__ TempStorage private_storage; + return private_storage.Alias(); + } + TempStorage_& temp_storage; + + public: + struct TempStorage : cub::Uninitialized {}; + + __device__ __forceinline__ WriteCoalescingCallbackWrapper(TransducerTableT transducer_table, + TransducedOutItT out_it, + TransducedIndexOutItT out_idx_it, + uint32_t thread_out_offset, + uint32_t tile_out_offset, + uint32_t tile_in_offset, + uint32_t tile_out_count) + : temp_storage(PrivateStorage()), + transducer_table(transducer_table), + out_it(out_it), + out_idx_it(out_idx_it), + thread_out_offset(thread_out_offset), + tile_out_offset(tile_out_offset), + tile_in_offset(tile_in_offset), + tile_out_count(tile_out_count) + { + } + + template + __device__ __forceinline__ void Init(OffsetT const& offset) + { + this->in_offset = offset; + } + + template + __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id, + SymbolT const read_symbol) { uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); - if (write) { -#if defined(__CUDA_ARCH__) -#pragma unroll 1 -#endif - for (uint32_t out_char = 0; out_char < count; out_char++) { - out_it[out_count + out_char] = + for (uint32_t out_char = 0; out_char < count; out_char++) { + if constexpr (!DiscardIndexOutput) { + temp_storage.compacted_offset[thread_out_offset + out_char - tile_out_offset] = + in_offset + character_index - tile_in_offset; + } + if constexpr (!DiscardTranslatedOutput) { + temp_storage.compacted_symbols[thread_out_offset + out_char - tile_out_offset] = transducer_table(old_state, symbol_id, out_char, read_symbol); - out_idx_it[out_count + out_char] = offset + character_index; } } - out_count += count; + thread_out_offset += count; } - __host__ __device__ __forceinline__ void TearDown() {} + __device__ __forceinline__ void TearDown() + { + __syncthreads(); + if constexpr (!DiscardTranslatedOutput) { + for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { + out_it[tile_out_offset + out_char] = temp_storage.compacted_symbols[out_char]; + } + } + if constexpr (!DiscardIndexOutput) { + for (uint32_t 
out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { + out_idx_it[tile_out_offset + out_char] = + temp_storage.compacted_offset[out_char] + tile_in_offset; + } + } + __syncthreads(); + } public: TransducerTableT const transducer_table; TransducedOutItT out_it; TransducedIndexOutItT out_idx_it; - uint32_t out_count; - uint32_t offset; - bool write; + uint32_t thread_out_offset; + uint32_t tile_out_offset; + uint32_t tile_in_offset; + uint32_t in_offset; + uint32_t tile_out_count; }; /** @@ -125,17 +342,18 @@ class DFASimulationCallbackWrapper { template class StateVectorTransitionOp { public: - __host__ __device__ __forceinline__ StateVectorTransitionOp( + __device__ __forceinline__ StateVectorTransitionOp( TransitionTableT const& transition_table, std::array& state_vector) : transition_table(transition_table), state_vector(state_vector) { } template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - SymbolIndexT const& read_symbol_id, - SymbolT const& read_symbol) const + __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const& read_symbol_id, + SymbolT const& read_symbol) const { +#pragma unroll for (int32_t i = 0; i < NUM_INSTANCES; ++i) { state_vector[i] = transition_table(state_vector[i], read_symbol_id); } @@ -152,17 +370,17 @@ struct StateTransitionOp { TransitionTableT const& transition_table; CallbackOpT& callback_op; - __host__ __device__ __forceinline__ StateTransitionOp(TransitionTableT const& transition_table, - StateIndexT state, - CallbackOpT& callback_op) + __device__ __forceinline__ StateTransitionOp(TransitionTableT const& transition_table, + StateIndexT state, + CallbackOpT& callback_op) : transition_table(transition_table), state(state), callback_op(callback_op) { } template - __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - SymbolIndexT const& read_symbol_id, - SymbolT const& read_symbol) + __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const& read_symbol_id, + SymbolT const& read_symbol) { // Remember what state we were in before we made the transition StateIndexT previous_state = state; @@ -420,7 +638,7 @@ struct AgentDFA { __syncthreads(); // Thread's symbols - CharT* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; + CharT const* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; // Parse thread's symbols and transition the state-vector if (is_full_block) { @@ -538,6 +756,43 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL // The state transition vector passed on to the second stage of the algorithm StateVectorT out_state_vector; + using OutSymbolT = typename DfaT::OutSymbolT; + // static constexpr int32_t MIN_TRANSLATED_OUT = DfaT::MIN_TRANSLATED_OUT; + static constexpr int32_t num_max_translated_out = DfaT::MAX_TRANSLATED_OUT; + static constexpr bool discard_out_index = + ::cuda::std::is_same>::value; + static constexpr bool discard_out_it = + ::cuda::std::is_same>::value; + using NonWriteCoalescingT = + DFAWriteCallbackWrapper; + + using WriteCoalescingT = + WriteCoalescingCallbackWrapper; + + static constexpr bool is_translation_pass = (!IS_TRANS_VECTOR_PASS) || IS_SINGLE_PASS; + + // Use write-coalescing only if the worst-case output size per tile fits into shared memory + static constexpr bool can_use_smem_cache = + (sizeof(typename WriteCoalescingT::TempStorage) + sizeof(typename AgentDfaSimT::TempStorage) + + 
sizeof(typename DfaT::SymbolGroupStorageT) + sizeof(typename DfaT::TransitionTableStorageT) + + sizeof(typename DfaT::TranslationTableStorageT)) < (48 * 1024); + static constexpr bool use_smem_cache = + is_translation_pass and + (sizeof(typename WriteCoalescingT::TempStorage) <= AgentDFAPolicy::SMEM_THRESHOLD) and + can_use_smem_cache; + + using DFASimulationCallbackWrapperT = + typename cub::If::Type; + // Stage 1: Compute the state-transition vector if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) { // Keeping track of the state for each of the state machines @@ -576,7 +831,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL // -> first block/tile: write out block aggregate as the "tile's" inclusive (i.e., the one that // incorporates all preceding blocks/tiles results) //------------------------------------------------------------------------------ - if (IS_SINGLE_PASS) { + if constexpr (IS_SINGLE_PASS) { uint32_t tile_idx = blockIdx.x; using StateVectorCompositeOpT = VectorCompositeOp; @@ -623,10 +878,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL } // Perform finite-state machine simulation, computing size of transduced output - DFASimulationCallbackWrapper - callback_wrapper(transducer_table, transduced_out_it, transduced_out_idx_it); + DFACountCallbackWrapper count_chars_callback_op{transducer_table}; StateIndexT t_start_state = state; agent_dfa.GetThreadStateTransitions(symbol_matcher, @@ -635,7 +887,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL blockIdx.x * SYMBOLS_PER_BLOCK, num_chars, state, - callback_wrapper, + count_chars_callback_op, cub::Int2Type()); __syncthreads(); @@ -650,15 +902,18 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL __shared__ typename OffsetPrefixScanCallbackOpT_::TempStorage prefix_callback_temp_storage; uint32_t tile_idx = blockIdx.x; + uint32_t tile_out_offset{}; + uint32_t tile_out_count{}; + uint32_t thread_out_offset{}; if (tile_idx == 0) { OffsetT block_aggregate = 0; OutOffsetBlockScan(scan_temp_storage) - .ExclusiveScan(callback_wrapper.out_count, - callback_wrapper.out_count, + .ExclusiveScan(count_chars_callback_op.out_count, + thread_out_offset, static_cast(0), cub::Sum{}, block_aggregate); - + tile_out_count = block_aggregate; if (threadIdx.x == 0 /*and not IS_LAST_TILE*/) { offset_tile_state.SetInclusive(0, block_aggregate); } @@ -671,22 +926,28 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL offset_tile_state, prefix_callback_temp_storage, cub::Sum{}, tile_idx); OutOffsetBlockScan(scan_temp_storage) - .ExclusiveScan( - callback_wrapper.out_count, callback_wrapper.out_count, cub::Sum{}, prefix_op); - + .ExclusiveScan(count_chars_callback_op.out_count, thread_out_offset, cub::Sum{}, prefix_op); + tile_out_offset = prefix_op.GetExclusivePrefix(); + tile_out_count = prefix_op.GetBlockAggregate(); if (tile_idx == gridDim.x - 1 && threadIdx.x == 0) { *d_num_transduced_out_it = prefix_op.GetInclusivePrefix(); } } - callback_wrapper.write = true; + DFASimulationCallbackWrapperT write_translated_callback_op{transducer_table, + transduced_out_it, + transduced_out_idx_it, + thread_out_offset, + tile_out_offset, + blockIdx.x * SYMBOLS_PER_BLOCK, + tile_out_count}; agent_dfa.GetThreadStateTransitions(symbol_matcher, transition_table, d_chars, blockIdx.x * SYMBOLS_PER_BLOCK, num_chars, t_start_state, - callback_wrapper, + write_translated_callback_op, cub::Int2Type()); } } diff --git a/cpp/src/io/fst/dispatch_dfa.cuh 
b/cpp/src/io/fst/dispatch_dfa.cuh index be63ec6539f..ef5e9c8a78f 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -37,6 +37,11 @@ struct AgentDFAPolicy { // The number of symbols processed by each thread static constexpr int32_t ITEMS_PER_THREAD = _ITEMS_PER_THREAD; + + // If the shared memory-backed write buffer exceeds this threshold, the FST will skip buffering + // the output in a write buffer and instead immediately write out to global memory, potentially + // resulting in non-coalesced writes + static constexpr std::size_t SMEM_THRESHOLD = 24 * 1024; }; /** @@ -49,7 +54,7 @@ struct DeviceFSMPolicy { struct Policy900 : cub::ChainedPolicy<900, Policy900, Policy900> { enum { BLOCK_THREADS = 128, - ITEMS_PER_THREAD = 32, + ITEMS_PER_THREAD = 16, }; using AgentDFAPolicy = AgentDFAPolicy; diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 5532a7f994b..ae1f81fd541 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -367,18 +367,18 @@ class TransitionTable { template static KernelParameter InitDeviceTransitionTable( - std::array, MAX_NUM_STATES> const& translation_table) + std::array, MAX_NUM_STATES> const& transition_table) { KernelParameter init_data{}; - // translation_table[state][symbol] -> new state - for (std::size_t state = 0; state < translation_table.size(); ++state) { - for (std::size_t symbol = 0; symbol < translation_table[state].size(); ++symbol) { + // transition_table[state][symbol] -> new state + for (std::size_t state = 0; state < transition_table.size(); ++state) { + for (std::size_t symbol = 0; symbol < transition_table[state].size(); ++symbol) { CUDF_EXPECTS( - static_cast(translation_table[state][symbol]) <= + static_cast(transition_table[state][symbol]) <= std::numeric_limits::max(), "Target state index value exceeds value representable by the transition table's type"); init_data.transitions[symbol * MAX_NUM_STATES + state] = - static_cast(translation_table[state][symbol]); + static_cast(transition_table[state][symbol]); } } @@ -494,6 +494,10 @@ class dfa_device_view { // This is a value queried by the DFA simulation algorithm static constexpr int32_t MAX_NUM_STATES = NUM_STATES; + using OutSymbolT = typename TranslationTableT::OutSymbolT; + static constexpr int32_t MIN_TRANSLATED_OUT = TranslationTableT::MIN_TRANSLATED_OUT; + static constexpr int32_t MAX_TRANSLATED_OUT = TranslationTableT::MAX_TRANSLATED_OUT; + using SymbolGroupStorageT = std::conditional_t::value, typename SymbolGroupIdLookupT::TempStorage, typename cub::NullType>; @@ -542,24 +546,33 @@ class dfa_device_view { * @tparam OutSymbolT The symbol type being output * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output * symbols - * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_SYMBOLS The maximum number of symbol groups supported by this lookup table * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + * @tparam MIN_TRANSLATED_OUT_ The minimum number of symbols being output by a single state + * transition + * @tparam MAX_TRANSLATED_OUT_ The maximum number of symbols being output by a single state + * transition * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols - * be used. 
*/ -template class TransducerLookupTable { private: struct _TempStorage { OutSymbolOffsetT out_offset[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; - OutSymbolT out_symbols[MAX_TABLE_SIZE]; + OutSymbolT_ out_symbols[MAX_TABLE_SIZE]; }; public: + using OutSymbolT = OutSymbolT_; + static constexpr int32_t MIN_TRANSLATED_OUT = MIN_TRANSLATED_OUT_; + static constexpr int32_t MAX_TRANSLATED_OUT = MAX_TRANSLATED_OUT_; + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { @@ -567,6 +580,8 @@ class TransducerLookupTable { OutSymbolOffsetT, MAX_NUM_SYMBOLS, MAX_NUM_STATES, + MIN_TRANSLATED_OUT, + MAX_TRANSLATED_OUT, MAX_TABLE_SIZE>; OutSymbolOffsetT d_out_offsets[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; @@ -686,14 +701,19 @@ class TransducerLookupTable { * sequence of symbols that the finite-state transducer is supposed to output for each transition. * * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols - * be used + * @tparam MIN_TRANSLATED_OUT The minimum number of symbols being output by a single state + * transition + * @tparam MAX_TRANSLATED_OUT The maximum number of symbols being output by a single state + * transition * @tparam OutSymbolT The symbol type being output - * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_SYMBOLS The maximum number of symbol groups supported by this lookup table * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support * @param translation_table The translation table * @return A translation table of type `TransducerLookupTable`. */ template @@ -705,20 +725,30 @@ auto make_translation_table(std::array, MAX_N OutSymbolOffsetT, MAX_NUM_SYMBOLS, MAX_NUM_STATES, + MIN_TRANSLATED_OUT, + MAX_TRANSLATED_OUT, MAX_TABLE_SIZE>; return translation_table_t::InitDeviceTranslationTable(translation_table); } -template +template class TranslationOp { private: struct _TempStorage {}; public: + using OutSymbolT = OutSymbolT_; + static constexpr int32_t MIN_TRANSLATED_OUT = MIN_TRANSLATED_OUT_; + static constexpr int32_t MAX_TRANSLATED_OUT = MAX_TRANSLATED_OUT_; + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { - using LookupTableT = TranslationOp; + using LookupTableT = + TranslationOp; TranslationOpT translation_op; }; @@ -772,6 +802,10 @@ class TranslationOp { * * @tparam FunctorT A function object type that must implement two signatures: (1) with `(state_id, * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)` + * @tparam MIN_TRANSLATED_SYMBOLS The minimum number of translated output symbols for any given + * input symbol + * @tparam MAX_TRANSLATED_SYMBOLS The maximum number of translated output symbols for any given + * input symbol * @param map_op A function object that must implement two signatures: (1) with `(state_id, * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)`. 
* Invocations of the first signature, (1), must return the number of symbols that are emitted for @@ -779,10 +813,14 @@ class TranslationOp { * that transition, where `i` corresponds to `relative_offse` * @return A translation table of type `TranslationO` */ -template +template auto make_translation_functor(FunctorT map_op) { - return TranslationOp::InitDeviceTranslationTable(map_op); + return TranslationOp:: + InitDeviceTranslationTable(map_op); } /** diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index ca56a12eb36..760b2214365 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -302,11 +302,14 @@ void normalize_single_quotes(datasource::owning_buffer( + normalize_quotes::TransduceToNormalizedQuotes{}), + stream); rmm::device_uvector outbuf(indata.size() * 2, stream, mr); rmm::device_scalar outbuf_size(stream, mr); @@ -327,11 +330,14 @@ void normalize_whitespace(datasource::owning_buffer rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - auto parser = fst::detail::make_fst( - fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), - fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), - fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}), - stream); + static constexpr std::int32_t min_out = 0; + static constexpr std::int32_t max_out = 2; + auto parser = + fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs), + fst::detail::make_transition_table(normalize_whitespace::wna_state_tt), + fst::detail::make_translation_functor( + normalize_whitespace::TransduceToNormalizedWS{}), + stream); rmm::device_uvector outbuf(indata.size(), stream, mr); rmm::device_scalar outbuf_size(stream, mr); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index a007754ef4f..8decaf034f3 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1455,11 +1455,14 @@ void get_stack_context(device_span json_in, constexpr auto max_translation_table_size = to_stack_op::NUM_SYMBOL_GROUPS * to_stack_op::TT_NUM_STATES; - auto json_to_stack_ops_fst = fst::detail::make_fst( + static constexpr auto min_translated_out = 0; + static constexpr auto max_translated_out = 1; + auto json_to_stack_ops_fst = fst::detail::make_fst( fst::detail::make_symbol_group_lut(to_stack_op::get_sgid_lut(delimiter)), fst::detail::make_transition_table(to_stack_op::get_transition_table(stack_behavior)), - fst::detail::make_translation_table( - to_stack_op::get_translation_table(stack_behavior)), + fst::detail:: + make_translation_table( + to_stack_op::get_translation_table(stack_behavior)), stream); // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end @@ -1507,11 +1510,12 @@ std::pair, rmm::device_uvector> pr // Instantiate FST for post-processing the token stream to remove all tokens that belong to an // invalid JSON line token_filter::UnwrapTokenFromSymbolOp sgid_op{}; - auto filter_fst = - fst::detail::make_fst(fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), - fst::detail::make_transition_table(token_filter::transition_table), - fst::detail::make_translation_functor(token_filter::TransduceToken{}), - stream); + using symbol_t = thrust::tuple; + auto filter_fst = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), + 
fst::detail::make_transition_table(token_filter::transition_table), + fst::detail::make_translation_functor(token_filter::TransduceToken{}), + stream); auto const mr = rmm::mr::get_current_device_resource(); rmm::device_scalar d_num_selected_tokens(stream, mr); @@ -1598,7 +1602,8 @@ std::pair, rmm::device_uvector> ge fst::detail::make_symbol_group_lookup_op( fix_stack_of_excess_chars::SymbolPairToSymbolGroupId{delimiter}), fst::detail::make_transition_table(fix_stack_of_excess_chars::transition_table), - fst::detail::make_translation_functor(fix_stack_of_excess_chars::TransduceInputOp{}), + fst::detail::make_translation_functor( + fix_stack_of_excess_chars::TransduceInputOp{}), stream); fix_stack_of_excess_chars.Transduce(zip_in, static_cast(json_in.size()), @@ -1619,7 +1624,7 @@ std::pair, rmm::device_uvector> ge auto json_to_tokens_fst = fst::detail::make_fst( fst::detail::make_symbol_group_lookup_op(tokenizer_pda::PdaSymbolToSymbolGroupId{delimiter}), fst::detail::make_transition_table(tokenizer_pda::get_transition_table(format)), - fst::detail::make_translation_table( + fst::detail::make_translation_table( tokenizer_pda::get_translation_table(recover_from_error)), stream); diff --git a/cpp/tests/io/fst/common.hpp b/cpp/tests/io/fst/common.hpp index 382d21fabb8..0177300eda9 100644 --- a/cpp/tests/io/fst/common.hpp +++ b/cpp/tests/io/fst/common.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,6 +69,8 @@ std::array, TT_NUM_STATES> const pda_s /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}}; // Translation table (i.e., for each transition, what are the symbols that we output) +static constexpr auto min_translated_out = 1; +static constexpr auto max_translated_out = 1; std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const pda_out_tt{ {/* IN_STATE { [ } ] " \ OTHER */ /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}}, diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 4df0d3ae04d..8a8d3d39e0f 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -169,7 +169,9 @@ TEST_F(FstTest, GroundTruth) auto parser = cudf::io::fst::detail::make_fst( cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), cudf::io::fst::detail::make_transition_table(pda_state_tt), - cudf::io::fst::detail::make_translation_table(pda_out_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), stream); // Allocate device-side temporary storage & run algorithm From 39f256c3397afc9c495cb819636abddb23f81dc0 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 23 Jul 2024 19:03:16 -0500 Subject: [PATCH 06/15] Fall back to CPU for unsupported libcudf binaryops in cudf-polars (#16188) This PR adds logic that should trigger CPU fallback unsupported binary ops. 
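As a quick illustration, here is a minimal sketch of the check this PR introduces. The standalone helper below is hypothetical (in the PR the check lives in the `BinOp` constructor in `cudf_polars/dsl/expr.py`); the `plc.binaryop.is_supported_operation` call and the error message mirror the diff.

```python
import cudf._lib.pylibcudf as plc


def raise_if_unsupported_binop(out_dtype, left_dtype, right_dtype, op):
    # Ask libcudf whether this (output, lhs, rhs, op) combination is supported.
    # Raising NotImplementedError during IR translation makes cudf-polars decline
    # the query, so polars executes it on the CPU instead.
    if not plc.binaryop.is_supported_operation(out_dtype, left_dtype, right_dtype, op):
        raise NotImplementedError(
            f"Operation {op.name} not supported "
            f"for types {left_dtype.id().name} and {right_dtype.id().name} "
            f"with output type {out_dtype.id().name}"
        )
```

This replaces the previous special-cased timelike resolution check with a single query to libcudf, so the fallback criterion stays in sync with what libcudf can actually execute.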
Authors: - https://github.com/brandon-b-miller - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/16188 --- python/cudf_polars/cudf_polars/dsl/expr.py | 13 ++++--- .../cudf_polars/cudf_polars/utils/dtypes.py | 38 +------------------ .../tests/expressions/test_literal.py | 18 ++++++--- 3 files changed, 21 insertions(+), 48 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 9835e6f8461..6325feced94 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -1424,13 +1424,14 @@ def __init__( super().__init__(dtype) self.op = op self.children = (left, right) - if ( - op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB) - and plc.traits.is_chrono(left.dtype) - and plc.traits.is_chrono(right.dtype) - and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id()) + if not plc.binaryop.is_supported_operation( + self.dtype, left.dtype, right.dtype, op ): - raise NotImplementedError("Casting rules for timelike types") + raise NotImplementedError( + f"Operation {op.name} not supported " + f"for types {left.dtype.id().name} and {right.dtype.id().name} " + f"with output type {self.dtype.id().name}" + ) _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = { pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL, diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 1279fe91d48..cd68d021286 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -14,43 +14,7 @@ import cudf._lib.pylibcudf as plc -__all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"] - - -def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId): - """ - Do two datetime typeids have matching resolution for a binop. - - Parameters - ---------- - lid - Left type id - rid - Right type id - - Returns - ------- - True if resolutions are compatible, False otherwise. - - Notes - ----- - Polars has different casting rules for combining - datetimes/durations than libcudf, and while we don't encode the - casting rules fully, just reject things we can't handle. - - Precondition for correctness: both lid and rid are timelike. - """ - if lid == rid: - return True - # Timestamps are smaller than durations in the libcudf enum. 
- lid, rid = sorted([lid, rid]) - if lid == plc.TypeId.TIMESTAMP_MILLISECONDS: - return rid == plc.TypeId.DURATION_MILLISECONDS - elif lid == plc.TypeId.TIMESTAMP_MICROSECONDS: - return rid == plc.TypeId.DURATION_MICROSECONDS - elif lid == plc.TypeId.TIMESTAMP_NANOSECONDS: - return rid == plc.TypeId.DURATION_NANOSECONDS - return False +__all__ = ["from_polars", "downcast_arrow_lists"] def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType: diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py index 55e688428bd..5bd3131d1d7 100644 --- a/python/cudf_polars/tests/expressions/test_literal.py +++ b/python/cudf_polars/tests/expressions/test_literal.py @@ -6,6 +6,8 @@ import polars as pl +import cudf._lib.pylibcudf as plc + from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, @@ -64,11 +66,17 @@ def test_timelike_literal(timestamp, timedelta): adjusted=timestamp + timedelta, two_delta=timedelta + timedelta, ) - schema = q.collect_schema() - time_type = schema["time"] - delta_type = schema["delta"] - if dtypes.have_compatible_resolution( - dtypes.from_polars(time_type).id(), dtypes.from_polars(delta_type).id() + schema = {k: dtypes.from_polars(v) for k, v in q.collect_schema().items()} + if plc.binaryop.is_supported_operation( + schema["adjusted"], + schema["time"], + schema["delta"], + plc.binaryop.BinaryOperator.ADD, + ) and plc.binaryop.is_supported_operation( + schema["two_delta"], + schema["delta"], + schema["delta"], + plc.binaryop.BinaryOperator.ADD, ): assert_gpu_result_equal(q) else: From f0efc8b36a8f43cfa027966265dcea052bb5c45d Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 23 Jul 2024 17:17:05 -0700 Subject: [PATCH 07/15] Modify `make_host_vector` and `make_device_uvector` factories to optionally use pinned memory and kernel copy (#16206) Issue #15616 Modified `make_host_vector` functions to return `cudf::detail::host_vector`, which can use a pinned or a pageable memory resource. When pinned memory is used, the D2H copy is potentially done using a CUDA kernel. Also added factories to create `host_vector`s without device data. These are useful to replace uses of `std::vector` and `thrust::host_vector` when the data eventually gets copied to the GPU. Added `is_device_accessible` to `host_span`. With this, `make_device_uvector` can optionally use the kernel for the H2D copy. Modified `cudf::detail::host_vector` to be derived from `thrust::host_vector`, to avoid issues with implicit conversion from `std::vector`. Used `cudf::detail::host_vector` and its new factory functions wherever data ends up copied to the GPU. Stopped using `thrust::copy_n` for the kernel copy path in `cuda_memcpy` because of an optimization that allows it to fall back to `cudaMemCpyAsync`. We now call a simple local kernel. 
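For reviewers, a minimal usage sketch of the modified factories is below. All functions shown are the ones added or changed in this PR; the wrapping function and the 1 MiB threshold are illustrative only.

```cpp
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/pinned_memory.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

void copy_offsets_example(rmm::cuda_stream_view stream)
{
  // Host allocations at or below this threshold come from the pinned resource.
  cudf::set_allocate_host_as_pinned_threshold(1024 * 1024);

  // Returns cudf::detail::host_vector; pinned or pageable depending on the threshold.
  auto h_offsets = cudf::detail::make_empty_host_vector<cudf::size_type>(3, stream);
  h_offsets.push_back(0);
  h_offsets.push_back(128);
  h_offsets.push_back(256);

  // host_span remembers whether the buffer is device accessible, so the H2D copy
  // can take the kernel path for pinned memory and cudaMemcpyAsync otherwise.
  auto d_offsets = cudf::detail::make_device_uvector_async(
    h_offsets, stream, rmm::mr::get_current_device_resource());

  // D2H copy back into a (possibly pinned) host_vector.
  auto h_copy = cudf::detail::make_host_vector_sync(d_offsets, stream);
}
```

The point of threading `is_device_accessible` through the allocator and `host_span` is that callers like the one above do not need to know which resource backed the buffer; the copy path is chosen automatically.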
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) - Alessandro Bellina (https://github.com/abellina) URL: https://github.com/rapidsai/cudf/pull/16206 --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/detail/gather.cuh | 2 +- cpp/include/cudf/detail/null_mask.cuh | 4 +- .../cudf/detail/utilities/host_memory.hpp | 51 +++++++++ .../cudf/detail/utilities/host_vector.hpp | 24 +++- .../detail/utilities/vector_factories.hpp | 106 ++++++++++++------ cpp/include/cudf/io/text/detail/trie.hpp | 4 +- cpp/include/cudf/lists/detail/dremel.hpp | 10 +- cpp/include/cudf/utilities/pinned_memory.hpp | 16 +++ cpp/include/cudf/utilities/span.hpp | 32 ++++++ cpp/src/copying/concatenate.cu | 6 +- cpp/src/copying/contiguous_split.cu | 3 +- cpp/src/datetime/timezone.cpp | 6 +- cpp/src/dictionary/detail/concatenate.cu | 2 +- cpp/src/io/avro/reader_impl.cu | 8 +- cpp/src/io/csv/reader_impl.cu | 44 +++++--- cpp/src/io/json/json_column.cu | 4 +- cpp/src/io/json/nested_json_gpu.cu | 6 +- cpp/src/io/json/read_json.cu | 3 +- cpp/src/io/orc/reader_impl_decode.cu | 10 +- cpp/src/io/orc/stripe_enc.cu | 4 +- cpp/src/io/orc/writer_impl.cu | 50 +++++---- cpp/src/io/orc/writer_impl.hpp | 9 +- cpp/src/io/parquet/predicate_pushdown.cpp | 20 ++-- cpp/src/io/parquet/reader_impl_chunking.cu | 78 +++++++------ cpp/src/io/parquet/reader_impl_preprocess.cu | 10 +- cpp/src/io/parquet/writer_impl.cu | 7 +- cpp/src/lists/dremel.cu | 6 +- cpp/src/strings/combine/join.cu | 6 +- cpp/src/strings/convert/convert_datetime.cu | 2 +- cpp/src/strings/copying/concatenate.cu | 2 +- cpp/src/strings/filter_chars.cu | 2 +- cpp/src/strings/replace/multi_re.cu | 2 +- cpp/src/strings/translate.cu | 2 +- cpp/src/table/row_operators.cu | 5 +- cpp/src/utilities/cuda_memcpy.cu | 20 +++- .../{pinned_memory.cpp => host_memory.cpp} | 86 +++++++++++++- cpp/tests/io/json/json_tree.cpp | 6 +- cpp/tests/strings/integers_tests.cpp | 4 +- .../utilities_tests/pinned_memory_tests.cpp | 67 ++++++++++- 40 files changed, 539 insertions(+), 192 deletions(-) create mode 100644 cpp/include/cudf/detail/utilities/host_memory.hpp rename cpp/src/utilities/{pinned_memory.cpp => host_memory.cpp} (73%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5e79204a558..a2c2dd3af4c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -671,9 +671,9 @@ add_library( src/unary/null_ops.cu src/utilities/cuda_memcpy.cu src/utilities/default_stream.cpp + src/utilities/host_memory.cpp src/utilities/linked_column.cpp src/utilities/logger.cpp - src/utilities/pinned_memory.cpp src/utilities/prefetch.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 5977c7341c1..d3e9fc4974d 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -577,7 +577,7 @@ void gather_bitmask(table_view const& source, } // Make device array of target bitmask pointers - std::vector target_masks(target.size()); + auto target_masks = make_host_vector(target.size(), stream); std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) { return col->mutable_view().null_mask(); }); diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index e62675cbc8c..ae6db5409cc 100644 --- a/cpp/include/cudf/detail/null_mask.cuh 
+++ b/cpp/include/cudf/detail/null_mask.cuh @@ -430,7 +430,9 @@ std::vector segmented_count_bits(bitmask_type const* bitmask, if (num_segments == 0) { return std::vector{}; } // Construct a contiguous host buffer of indices and copy to device. - auto const h_indices = std::vector(indices_begin, indices_end); + auto h_indices = make_empty_host_vector::value_type>( + std::distance(indices_begin, indices_end), stream); + std::copy(indices_begin, indices_end, std::back_inserter(h_indices)); auto const d_indices = make_device_uvector_async(h_indices, stream, rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp new file mode 100644 index 00000000000..c6775a950c9 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cudf::detail { +/** + * @brief Get the memory resource to be used for pageable memory allocations. + * + * @return Reference to the pageable memory resource + */ +CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource(); + +/** + * @brief Get the allocator to be used for the host memory allocation. + * + * @param size The number of elements of type T to allocate + * @param stream The stream to use for the allocation + * @return The allocator to be used for the host memory allocation + */ +template +rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view stream) +{ + if (size * sizeof(T) <= get_allocate_host_as_pinned_threshold()) { + return {get_pinned_memory_resource(), stream}; + } + return {get_pageable_memory_resource(), stream}; +} + +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index 2d14d0306cd..f4e5f718da4 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -61,6 +61,10 @@ class rmm_host_allocator { }; }; +template +inline constexpr bool contains_property = + (cuda::std::is_same_v || ... || false); + /*! \p rmm_host_allocator is a CUDA-specific host memory allocator * that employs \c `rmm::host_async_resource_ref` for allocation. 
* @@ -100,8 +104,12 @@ class rmm_host_allocator { /** * @brief Construct from a `cudf::host_async_resource_ref` */ - rmm_host_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) - : mr(_mr), stream(_stream) + template + rmm_host_allocator(cuda::mr::async_resource_ref _mr, + rmm::cuda_stream_view _stream) + : mr(_mr), + stream(_stream), + _is_device_accessible{contains_property} { } @@ -173,15 +181,25 @@ class rmm_host_allocator { */ inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); } + bool is_device_accessible() const { return _is_device_accessible; } + private: rmm::host_async_resource_ref mr; rmm::cuda_stream_view stream; + bool _is_device_accessible; }; /** * @brief A vector class with rmm host memory allocator */ template -using host_vector = thrust::host_vector>; +class host_vector : public thrust::host_vector> { + public: + using base = thrust::host_vector>; + + host_vector(rmm_host_allocator const& alloc) : base(alloc) {} + + host_vector(size_t size, rmm_host_allocator const& alloc) : base(size, alloc) {} +}; } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 20cb55bb1c7..45dc839c9bd 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -21,6 +21,8 @@ * @file vector_factories.hpp */ +#include +#include #include #include #include @@ -32,8 +34,6 @@ #include #include -#include - #include namespace cudf { @@ -100,11 +100,12 @@ rmm::device_uvector make_device_uvector_async(host_span source_data, rmm::device_async_resource_ref mr) { rmm::device_uvector ret(source_data.size(), stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(), - source_data.data(), - source_data.size() * sizeof(T), - cudaMemcpyDefault, - stream.value())); + auto const is_pinned = source_data.is_device_accessible(); + cuda_memcpy_async(ret.data(), + source_data.data(), + source_data.size() * sizeof(T), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); return ret; } @@ -271,21 +272,11 @@ rmm::device_uvector make_device_uvector_sync( return make_device_uvector_sync(device_span{c}, stream, mr); } -// Utility function template to allow copying to either a thrust::host_vector or std::vector -template -OutContainer make_vector_async(device_span v, rmm::cuda_stream_view stream) -{ - OutContainer result(v.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync( - result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value())); - return result; -} - /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a * `device_span` * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -295,14 +286,17 @@ OutContainer make_vector_async(device_span v, rmm::cuda_stream_view str template std::vector make_std_vector_async(device_span v, rmm::cuda_stream_view stream) { - return make_vector_async>(v, stream); + std::vector result(v.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync( + result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value())); + return result; } /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a device * container * - * @note This function synchronizes `stream`. 
+ * @note This function synchronizes `stream` after the copy. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -324,7 +318,7 @@ std::vector make_std_vector_async(Container cons * @brief Synchronously construct a `std::vector` containing a copy of data from a * `device_span` * - * @note This function does a synchronize on `stream`. + * @note This function does a synchronize on `stream` after the copy. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -361,11 +355,46 @@ std::vector make_std_vector_sync(Container const return make_std_vector_sync(device_span{c}, stream); } +/** + * @brief Construct a `cudf::detail::host_vector` of the given size. + * + * @note The returned vector may be using a pinned memory resource. + * + * @tparam T The type of the vector data + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory + * @return A host_vector of the given size + */ +template +host_vector make_host_vector(size_t size, rmm::cuda_stream_view stream) +{ + return host_vector(size, get_host_allocator(size, stream)); +} + +/** + * @brief Construct an empty `cudf::detail::host_vector` with the given capacity. + * + * @note The returned vector may be using a pinned memory resource. + * + * @tparam T The type of the vector data + * @param capacity Initial capacity of the vector + * @param stream The stream on which to allocate memory + * @return A host_vector with the given capacity + */ +template +host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view stream) +{ + auto result = host_vector(get_host_allocator(capacity, stream)); + result.reserve(capacity); + return result; +} + /** * @brief Asynchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. The returned vector may be + * using a pinned memory resource. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -373,16 +402,24 @@ std::vector make_std_vector_sync(Container const * @return The data copied to the host */ template -thrust::host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) +host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) { - return make_vector_async>(v, stream); + auto result = make_host_vector(v.size(), stream); + auto const is_pinned = result.get_allocator().is_device_accessible(); + cuda_memcpy_async(result.data(), + v.data(), + v.size() * sizeof(T), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); + return result; } /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a device * container * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. The returned vector may be + * using a pinned memory resource. 
* * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -394,8 +431,8 @@ template < typename Container, std::enable_if_t< std::is_convertible_v>>* = nullptr> -thrust::host_vector make_host_vector_async( - Container const& c, rmm::cuda_stream_view stream) +host_vector make_host_vector_async(Container const& c, + rmm::cuda_stream_view stream) { return make_host_vector_async(device_span{c}, stream); } @@ -404,7 +441,8 @@ thrust::host_vector make_host_vector_async( * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` * - * @note This function does a synchronize on `stream`. + * @note This function does a synchronize on `stream` after the copy. The returned vector may be + * using a pinned memory resource. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -412,7 +450,7 @@ thrust::host_vector make_host_vector_async( * @return The data copied to the host */ template -thrust::host_vector make_host_vector_sync(device_span v, rmm::cuda_stream_view stream) +host_vector make_host_vector_sync(device_span v, rmm::cuda_stream_view stream) { auto result = make_host_vector_async(v, stream); stream.synchronize(); @@ -423,7 +461,7 @@ thrust::host_vector make_host_vector_sync(device_span v, rmm::cuda_s * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a device * container * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -435,8 +473,8 @@ template < typename Container, std::enable_if_t< std::is_convertible_v>>* = nullptr> -thrust::host_vector make_host_vector_sync( - Container const& c, rmm::cuda_stream_view stream) +host_vector make_host_vector_sync(Container const& c, + rmm::cuda_stream_view stream) { return make_host_vector_sync(device_span{c}, stream); } @@ -444,7 +482,7 @@ thrust::host_vector make_host_vector_sync( /** * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size * - * @note This function may not synchronize `stream`. + * @note This function may not synchronize `stream` after the copy. * * @tparam T The type of the vector data * @param size The number of elements in the created vector @@ -460,7 +498,7 @@ host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view strea /** * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. 
* * @tparam T The type of the vector data * @param size The number of elements in the created vector diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index e0b9c7635e3..28862d97ede 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -223,11 +223,11 @@ struct trie { match_length.emplace_back(0); - std::vector trie_nodes; auto token_counts = std::unordered_map(); + auto trie_nodes = cudf::detail::make_empty_host_vector(tokens.size(), stream); for (uint32_t i = 0; i < tokens.size(); i++) { - trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]}); + trie_nodes.push_back(trie_node{tokens[i], match_length[i], transitions[i]}); token_counts[tokens[i]]++; } diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp index d36a4091947..53448424827 100644 --- a/cpp/include/cudf/lists/detail/dremel.hpp +++ b/cpp/include/cudf/lists/detail/dremel.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -31,8 +31,8 @@ struct dremel_device_view { size_type const* offsets; uint8_t const* rep_levels; uint8_t const* def_levels; - size_type const leaf_data_size; - uint8_t const max_def_level; + size_type leaf_data_size; + uint8_t max_def_level; }; /** @@ -45,8 +45,8 @@ struct dremel_data { rmm::device_uvector rep_level; rmm::device_uvector def_level; - size_type const leaf_data_size; - uint8_t const max_def_level; + size_type leaf_data_size; + uint8_t max_def_level; operator dremel_device_view() const { diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index 3e2fa43cb50..fa7e1b35327 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -71,4 +71,20 @@ void set_kernel_pinned_copy_threshold(size_t threshold); */ size_t get_kernel_pinned_copy_threshold(); +/** + * @brief Set the threshold size for allocating host memory as pinned memory. + * + * @param threshold The threshold size in bytes. If the size of the allocation is less or equal to + * this threshold, the memory will be allocated as pinned memory. If the size is greater than this + * threshold, the memory will be allocated as pageable memory. + */ +void set_allocate_host_as_pinned_threshold(size_t threshold); + +/** + * @brief Get the threshold size for allocating host memory as pinned memory. + * + * @return The threshold size in bytes. 
+ */ +size_t get_allocate_host_as_pinned_threshold(); + } // namespace cudf diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 3b35e60e034..c5054c733a7 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -16,6 +16,8 @@ #pragma once +#include + #include #include #include @@ -257,6 +259,26 @@ struct host_span : public cudf::detail::span_base>* = nullptr> + constexpr host_span(cudf::detail::host_vector& in) + : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} + { + } + + /// Constructor from a const host_vector + /// @param in The host_vector to construct the span from + template >* = nullptr> + constexpr host_span(cudf::detail::host_vector const& in) + : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} + { + } + // Copy construction to support const conversion /// @param other The span to copy template views, rmm::cuda_stream_vi }); // Assemble contiguous array of device views - auto device_views = thrust::host_vector(); - device_views.reserve(views.size()); + auto device_views = + cudf::detail::make_empty_host_vector(views.size(), stream); std::transform(device_view_owners.cbegin(), device_view_owners.cend(), std::back_inserter(device_views), @@ -84,7 +84,7 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi make_device_uvector_async(device_views, stream, rmm::mr::get_current_device_resource()); // Compute the partition offsets - auto offsets = thrust::host_vector(views.size() + 1); + auto offsets = cudf::detail::make_host_vector(views.size() + 1, stream); thrust::transform_inclusive_scan( thrust::host, device_views.cbegin(), diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 37db2c74790..95544742fb7 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1539,7 +1539,8 @@ std::unique_ptr chunk_iteration_state::create( std::vector num_batches_per_iteration; std::vector size_of_batches_per_iteration; - std::vector accum_size_per_iteration; + auto accum_size_per_iteration = + cudf::detail::make_empty_host_vector(h_offsets.size(), stream); std::size_t accum_size = 0; { auto current_offset_it = h_offsets.begin(); diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 1b0d201501b..7ca1b51df98 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -485,14 +485,12 @@ std::unique_ptr make_timezone_transition_table(std::optional ttimes_typed; - ttimes_typed.reserve(transition_times.size()); + auto ttimes_typed = make_empty_host_vector(transition_times.size(), stream); std::transform(transition_times.cbegin(), transition_times.cend(), std::back_inserter(ttimes_typed), [](auto ts) { return timestamp_s{duration_s{ts}}; }); - std::vector offsets_typed; - offsets_typed.reserve(offsets.size()); + auto offsets_typed = make_empty_host_vector(offsets.size(), stream); std::transform(offsets.cbegin(), offsets.cend(), std::back_inserter(offsets_typed), [](auto ts) { return duration_s{ts}; }); diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index fdc3d9d0ecf..72828309425 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -105,7 +105,7 @@ struct compute_children_offsets_fn { */ rmm::device_uvector create_children_offsets(rmm::cuda_stream_view stream) { - std::vector offsets(columns_ptrs.size()); + 
auto offsets = cudf::detail::make_host_vector(columns_ptrs.size(), stream); thrust::transform_exclusive_scan( thrust::host, columns_ptrs.begin(), diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 814efe2b5a1..69a0e982a5b 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -554,9 +554,11 @@ table_with_metadata read_avro(std::unique_ptr&& source, auto d_global_dict_data = rmm::device_uvector(0, stream); if (total_dictionary_entries > 0) { - auto h_global_dict = std::vector(total_dictionary_entries); - auto h_global_dict_data = std::vector(dictionary_data_size); - size_t dict_pos = 0; + auto h_global_dict = + cudf::detail::make_host_vector(total_dictionary_entries, stream); + auto h_global_dict_data = + cudf::detail::make_host_vector(dictionary_data_size, stream); + size_t dict_pos = 0; for (size_t i = 0; i < column_types.size(); ++i) { auto const col_idx = selected_columns[i].first; diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 05faded651d..40d4372ae9d 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -567,7 +567,7 @@ void infer_column_types(parse_options const& parse_opts, } std::vector decode_data(parse_options const& parse_opts, - std::vector const& column_flags, + host_span column_flags, std::vector const& column_names, device_span data, device_span row_offsets, @@ -592,8 +592,8 @@ std::vector decode_data(parse_options const& parse_opts, } } - thrust::host_vector h_data(num_active_columns); - thrust::host_vector h_valid(num_active_columns); + auto h_data = cudf::detail::make_host_vector(num_active_columns, stream); + auto h_valid = cudf::detail::make_host_vector(num_active_columns, stream); for (int i = 0; i < num_active_columns; ++i) { h_data[i] = out_buffers[i].data(); @@ -622,14 +622,16 @@ std::vector decode_data(parse_options const& parse_opts, return out_buffers; } -std::vector determine_column_types(csv_reader_options const& reader_opts, - parse_options const& parse_opts, - host_span column_names, - device_span data, - device_span row_offsets, - int32_t num_records, - host_span column_flags, - rmm::cuda_stream_view stream) +cudf::detail::host_vector determine_column_types( + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + host_span column_names, + device_span data, + device_span row_offsets, + int32_t num_records, + host_span column_flags, + cudf::size_type num_active_columns, + rmm::cuda_stream_view stream) { std::vector column_types(column_flags.size()); @@ -653,7 +655,8 @@ std::vector determine_column_types(csv_reader_options const& reader_o stream); // compact column_types to only include active columns - std::vector active_col_types; + auto active_col_types = + cudf::detail::make_empty_host_vector(num_active_columns, stream); std::copy_if(column_types.cbegin(), column_types.cend(), std::back_inserter(active_col_types), @@ -697,8 +700,10 @@ table_with_metadata read_csv(cudf::io::datasource* source, auto const num_actual_columns = static_cast(column_names.size()); auto num_active_columns = num_actual_columns; - auto column_flags = std::vector( - num_actual_columns, column_parse::enabled | column_parse::inferred); + auto column_flags = + cudf::detail::make_host_vector(num_actual_columns, stream); + std::fill( + column_flags.begin(), column_flags.end(), column_parse::enabled | column_parse::inferred); // User did not pass column names to override names in the file // Process names from the file to remove empty and 
duplicated strings @@ -842,8 +847,15 @@ table_with_metadata read_csv(cudf::io::datasource* source, // Exclude the end-of-data row from number of rows with actual data auto const num_records = std::max(row_offsets.size(), 1ul) - 1; - auto const column_types = determine_column_types( - reader_opts, parse_opts, column_names, data, row_offsets, num_records, column_flags, stream); + auto const column_types = determine_column_types(reader_opts, + parse_opts, + column_names, + data, + row_offsets, + num_records, + column_flags, + num_active_columns, + stream); auto metadata = table_metadata{}; auto out_columns = std::vector>(); diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 3e587768b11..17fa7abdffe 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -622,7 +622,7 @@ void make_device_json_column(device_span input, // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking std::map, NodeIndexT> mapped_columns; // find column_ids which are values, but should be ignored in validity - std::vector ignore_vals(num_columns, 0); + auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); std::vector is_mixed_type_column(num_columns, 0); std::vector is_pruned(num_columns, 0); columns.try_emplace(parent_node_sentinel, std::ref(root)); @@ -812,7 +812,7 @@ void make_device_json_column(device_span input, return thrust::get<1>(a) < thrust::get<1>(b); }); // move columns data to device. - std::vector columns_data(num_columns); + auto columns_data = cudf::detail::make_host_vector(num_columns, stream); for (auto& [col_id, col_ref] : columns) { if (col_id == parent_node_sentinel) continue; auto& col = col_ref.get(); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 8decaf034f3..1e484d74679 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1703,10 +1703,8 @@ void make_json_column(json_column& root_column, auto const [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); // Copy the JSON tokens to the host - thrust::host_vector tokens = - cudf::detail::make_host_vector_async(d_tokens_gpu, stream); - thrust::host_vector token_indices_gpu = - cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); + auto tokens = cudf::detail::make_host_vector_async(d_tokens_gpu, stream); + auto token_indices_gpu = cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); // Make sure tokens have been copied to the host stream.synchronize(); diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 0ba4dedfc34..590f70864b1 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -78,10 +78,9 @@ device_span ingest_raw_input(device_span buffer, auto constexpr num_delimiter_chars = 1; if (compression == compression_type::NONE) { - std::vector delimiter_map{}; + auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); std::vector prefsum_source_sizes(sources.size()); std::vector> h_buffers; - delimiter_map.reserve(sources.size()); size_t bytes_read = 0; std::transform_inclusive_scan(sources.begin(), sources.end(), diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 8e20505d3ff..e3b9a048be8 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -492,11 +492,17 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& if (num_stripes == 
0) return; auto const num_columns = chunks.size().second; - std::vector> prefix_sums_to_update; + auto const num_struct_cols = + std::count_if(chunks[0].begin(), chunks[0].end(), [](auto const& chunk) { + return chunk.type_kind == STRUCT; + }); + auto prefix_sums_to_update = + cudf::detail::make_empty_host_vector>(num_struct_cols, + stream); for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { // Null counts sums are only needed for children of struct columns if (chunks[0][col_idx].type_kind == STRUCT) { - prefix_sums_to_update.emplace_back(col_idx, d_prefix_sums + num_stripes * col_idx); + prefix_sums_to_update.push_back({col_idx, d_prefix_sums + num_stripes * col_idx}); } } auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 805959327ac..80f32512b98 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1417,8 +1417,8 @@ void decimal_sizes_to_offsets(device_2dspan rg_bounds, if (rg_bounds.count() == 0) return; // Convert map to a vector of views of the `elem_sizes` device buffers - std::vector h_sizes; - h_sizes.reserve(elem_sizes.size()); + auto h_sizes = + cudf::detail::make_empty_host_vector(elem_sizes.size(), stream); std::transform(elem_sizes.begin(), elem_sizes.end(), std::back_inserter(h_sizes), [](auto& p) { return decimal_column_element_sizes{p.first, p.second}; }); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 4cb20bb7518..f3b8cfbc836 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -444,14 +444,17 @@ namespace { */ file_segmentation calculate_segmentation(host_span columns, hostdevice_2dvector&& rowgroup_bounds, - stripe_size_limits max_stripe_size) + stripe_size_limits max_stripe_size, + rmm::cuda_stream_view stream) { - std::vector infos; - auto const num_rowgroups = rowgroup_bounds.size().first; - size_t stripe_start = 0; - size_t stripe_bytes = 0; - size_type stripe_rows = 0; - for (size_t rg_idx = 0; rg_idx < num_rowgroups; ++rg_idx) { + // Number of stripes is not known in advance. Only reserve a single element to use pinned memory + // resource if at all enabled. 
+ auto infos = cudf::detail::make_empty_host_vector(1, stream); + size_type const num_rowgroups = rowgroup_bounds.size().first; + size_type stripe_start = 0; + size_t stripe_bytes = 0; + size_type stripe_rows = 0; + for (size_type rg_idx = 0; rg_idx < num_rowgroups; ++rg_idx) { auto const rowgroup_total_bytes = std::accumulate(columns.begin(), columns.end(), 0ul, [&](size_t total_size, auto const& col) { auto const rows = rowgroup_bounds[rg_idx][col.index()].size(); @@ -470,7 +473,9 @@ file_segmentation calculate_segmentation(host_span column // Check if adding the current rowgroup to the stripe will make the stripe too large or long if ((rg_idx > stripe_start) && (stripe_bytes + rowgroup_total_bytes > max_stripe_size.bytes || stripe_rows + rowgroup_rows_max > max_stripe_size.rows)) { - infos.emplace_back(infos.size(), stripe_start, rg_idx - stripe_start); + infos.push_back(stripe_rowgroups{static_cast(infos.size()), + stripe_start, + static_cast(rg_idx - stripe_start)}); stripe_start = rg_idx; stripe_bytes = 0; stripe_rows = 0; @@ -479,7 +484,9 @@ file_segmentation calculate_segmentation(host_span column stripe_bytes += rowgroup_total_bytes; stripe_rows += rowgroup_rows_max; if (rg_idx + 1 == num_rowgroups) { - infos.emplace_back(infos.size(), stripe_start, num_rowgroups - stripe_start); + infos.push_back(stripe_rowgroups{static_cast(infos.size()), + stripe_start, + static_cast(num_rowgroups - stripe_start)}); } } @@ -1336,7 +1343,7 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, if (num_file_blobs == 0) { return {}; } // Create empty file stats and merge groups - std::vector h_stat_chunks(num_file_blobs); + auto h_stat_chunks = cudf::detail::make_host_vector(num_file_blobs, stream); cudf::detail::hostdevice_vector stats_merge(num_file_blobs, stream); // Fill in stats_merge and stat_chunks on the host for (auto i = 0u; i < num_file_blobs; ++i) { @@ -1677,39 +1684,39 @@ struct pushdown_null_masks { // Owning vector for masks in device memory std::vector> data; // Pointers to pushdown masks in device memory. Can be same for multiple columns. - std::vector masks; + cudf::detail::host_vector masks; }; pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, rmm::cuda_stream_view stream) { - std::vector mask_ptrs; - mask_ptrs.reserve(orc_table.num_columns()); + auto mask_ptrs = + cudf::detail::make_empty_host_vector(orc_table.num_columns(), stream); std::vector> pd_masks; for (auto const& col : orc_table.columns) { // Leaf columns don't need pushdown masks if (col.num_children() == 0) { - mask_ptrs.emplace_back(nullptr); + mask_ptrs.push_back({nullptr}); continue; } auto const parent_pd_mask = col.is_child() ? 
mask_ptrs[col.parent_index()] : nullptr; auto const null_mask = col.null_mask(); if (null_mask == nullptr and parent_pd_mask == nullptr) { - mask_ptrs.emplace_back(nullptr); + mask_ptrs.push_back({nullptr}); continue; } if (col.orc_kind() == STRUCT) { if (null_mask != nullptr and parent_pd_mask == nullptr) { // Reuse own null mask - mask_ptrs.emplace_back(null_mask); + mask_ptrs.push_back(null_mask); } else if (null_mask == nullptr and parent_pd_mask != nullptr) { // Reuse parent's pushdown mask - mask_ptrs.emplace_back(parent_pd_mask); + mask_ptrs.push_back(parent_pd_mask); } else { // Both are nullable, allocate new pushdown mask pd_masks.emplace_back(num_bitmask_words(col.size()), stream); - mask_ptrs.emplace_back(pd_masks.back().data()); + mask_ptrs.push_back({pd_masks.back().data()}); thrust::transform(rmm::exec_policy(stream), null_mask, @@ -1724,7 +1731,7 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, auto const child_col = orc_table.column(col.child_begin()[0]); // pushdown mask applies to child column(s); use the child column size pd_masks.emplace_back(num_bitmask_words(child_col.size()), stream); - mask_ptrs.emplace_back(pd_masks.back().data()); + mask_ptrs.push_back({pd_masks.back().data()}); pushdown_lists_null_mask(col, orc_table.d_columns, parent_pd_mask, pd_masks.back(), stream); } } @@ -1815,8 +1822,7 @@ orc_table_view make_orc_table_view(table_view const& table, append_orc_column(table.column(col_idx), nullptr, table_meta.column_metadata[col_idx]); } - std::vector type_kinds; - type_kinds.reserve(orc_columns.size()); + auto type_kinds = cudf::detail::make_empty_host_vector(orc_columns.size(), stream); std::transform( orc_columns.cbegin(), orc_columns.cend(), std::back_inserter(type_kinds), [](auto& orc_column) { return orc_column.orc_kind(); @@ -2299,7 +2305,7 @@ auto convert_table_to_orc_data(table_view const& input, // Decide stripe boundaries based on rowgroups and char counts auto segmentation = - calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size); + calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size, stream); auto stripe_dicts = build_dictionaries(orc_table, segmentation, sort_dictionaries, stream); auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index bd082befe0c..f5f8b3cfed9 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -78,10 +78,9 @@ struct orc_table_view { * Provides a container-like interface to iterate over rowgroup indices. 
*/ struct stripe_rowgroups { - uint32_t id; // stripe id - uint32_t first; // first rowgroup in the stripe - uint32_t size; // number of rowgroups in the stripe - stripe_rowgroups(uint32_t id, uint32_t first, uint32_t size) : id{id}, first{first}, size{size} {} + size_type id; // stripe id + size_type first; // first rowgroup in the stripe + size_type size; // number of rowgroups in the stripe [[nodiscard]] auto cbegin() const { return thrust::make_counting_iterator(first); } [[nodiscard]] auto cend() const { return thrust::make_counting_iterator(first + size); } }; @@ -125,7 +124,7 @@ class orc_streams { */ struct file_segmentation { hostdevice_2dvector rowgroups; - std::vector stripes; + cudf::detail::host_vector stripes; auto num_rowgroups() const noexcept { return rowgroups.size().first; } auto num_stripes() const noexcept { return stripes.size(); } diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 11f4a00ee8b..481c1e9fcdd 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -141,11 +141,11 @@ struct stats_caster { // Local struct to hold host columns struct host_column { // using thrust::host_vector because std::vector uses bitmap instead of byte per bool. - thrust::host_vector val; + cudf::detail::host_vector val; std::vector null_mask; cudf::size_type null_count = 0; - host_column(size_type total_row_groups) - : val(total_row_groups), + host_column(size_type total_row_groups, rmm::cuda_stream_view stream) + : val{cudf::detail::make_host_vector(total_row_groups, stream)}, null_mask( cudf::util::div_rounding_up_safe( cudf::bitmask_allocation_size_bytes(total_row_groups), sizeof(bitmask_type)), @@ -170,8 +170,14 @@ struct stats_caster { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - std::vector chars{}; - std::vector offsets(1, 0); + auto const total_char_count = std::accumulate( + host_strings.begin(), host_strings.end(), 0, [](auto sum, auto const& str) { + return sum + str.size_bytes(); + }); + auto chars = cudf::detail::make_empty_host_vector(total_char_count, stream); + auto offsets = + cudf::detail::make_empty_host_vector(host_strings.size() + 1, stream); + offsets.push_back(0); for (auto const& str : host_strings) { auto tmp = str.empty() ? 
std::string_view{} : std::string_view(str.data(), str.size_bytes()); @@ -206,8 +212,8 @@ struct stats_caster { null_count); } }; // local struct host_column - host_column min(total_row_groups); - host_column max(total_row_groups); + host_column min(total_row_groups, stream); + host_column max(total_row_groups, stream); size_type stats_idx = 0; for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) { for (auto const rg_idx : row_group_indices[src_idx]) { diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 05e0d8c0111..794750ab6d2 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -804,16 +804,16 @@ std::vector compute_page_splits_by_row(device_span> comp_in; - comp_in.reserve(num_comp_pages); - std::vector> comp_out; - comp_out.reserve(num_comp_pages); + auto comp_in = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); + auto comp_out = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); // vectors to save v2 def and rep level data, if any - std::vector> copy_in; - copy_in.reserve(num_comp_pages); - std::vector> copy_out; - copy_out.reserve(num_comp_pages); + auto copy_in = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); + auto copy_out = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); rmm::device_uvector comp_res(num_comp_pages, stream); thrust::fill(rmm::exec_policy_nosync(stream), @@ -822,7 +822,6 @@ std::vector compute_page_splits_by_row(device_span compute_page_splits_by_row(device_span(offset)}); + copy_out.push_back({dst_base, static_cast(offset)}); } - comp_in.emplace_back(page.page_data + offset, - static_cast(page.compressed_page_size - offset)); - comp_out.emplace_back(dst_base + offset, - static_cast(page.uncompressed_page_size - offset)); + comp_in.push_back( + {page.page_data + offset, static_cast(page.compressed_page_size - offset)}); + comp_out.push_back( + {dst_base + offset, static_cast(page.uncompressed_page_size - offset)}); page.page_data = dst_base; decomp_offset += page.uncompressed_page_size; }); + } + auto d_comp_in = cudf::detail::make_device_uvector_async( + comp_in, stream, rmm::mr::get_current_device_resource()); + auto d_comp_out = cudf::detail::make_device_uvector_async( + comp_out, stream, rmm::mr::get_current_device_resource()); + + int32_t start_pos = 0; + for (auto const& codec : codecs) { + if (codec.num_pages == 0) { continue; } + + device_span const> d_comp_in_view{d_comp_in.data() + start_pos, + codec.num_pages}; + + device_span const> d_comp_out_view(d_comp_out.data() + start_pos, + codec.num_pages); - host_span const> comp_in_view{comp_in.data() + start_pos, - codec.num_pages}; - auto const d_comp_in = cudf::detail::make_device_uvector_async( - comp_in_view, stream, rmm::mr::get_current_device_resource()); - host_span const> comp_out_view(comp_out.data() + start_pos, - codec.num_pages); - auto const d_comp_out = cudf::detail::make_device_uvector_async( - comp_out_view, stream, rmm::mr::get_current_device_resource()); device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { case GZIP: - gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); + gpuinflate( + d_comp_in_view, d_comp_out_view, d_comp_res_view, gzip_header_included::YES, stream); break; case SNAPPY: if (cudf::io::nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, - 
d_comp_in, - d_comp_out, + d_comp_in_view, + d_comp_out_view, d_comp_res_view, codec.max_decompressed_size, codec.total_decomp_size, stream); } else { - gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); + gpu_unsnap(d_comp_in_view, d_comp_out, d_comp_res_view, stream); } break; case ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, - d_comp_in, - d_comp_out, + d_comp_in_view, + d_comp_out_view, d_comp_res_view, codec.max_decompressed_size, codec.total_decomp_size, stream); break; case BROTLI: - gpu_debrotli(d_comp_in, - d_comp_out, + gpu_debrotli(d_comp_in_view, + d_comp_out_view, d_comp_res_view, debrotli_scratch.data(), debrotli_scratch.size(), @@ -893,8 +900,8 @@ std::vector compute_page_splits_by_row(device_span chunk decomp_sum{}); // retrieve to host so we can call nvcomp to get compression scratch sizes - std::vector h_decomp_info = - cudf::detail::make_std_vector_sync(decomp_info, stream); - std::vector temp_cost(pages.size()); + auto h_decomp_info = cudf::detail::make_host_vector_sync(decomp_info, stream); + auto temp_cost = cudf::detail::make_host_vector(pages.size(), stream); thrust::transform(thrust::host, h_decomp_info.begin(), h_decomp_info.end(), diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index ff47dfc4cf3..e006cc7d714 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -370,7 +370,7 @@ void fill_in_page_info(host_span chunks, rmm::cuda_stream_view stream) { auto const num_pages = pages.size(); - std::vector page_indexes(num_pages); + auto page_indexes = cudf::detail::make_host_vector(num_pages, stream); for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { auto const& chunk = chunks[c]; @@ -1031,8 +1031,8 @@ struct get_page_num_rows { }; struct input_col_info { - int const schema_idx; - size_type const nesting_depth; + int schema_idx; + size_type nesting_depth; }; /** @@ -1523,8 +1523,8 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num // compute output column sizes by examining the pages of the -input- columns if (has_lists) { - std::vector h_cols_info; - h_cols_info.reserve(_input_columns.size()); + auto h_cols_info = + cudf::detail::make_empty_host_vector(_input_columns.size(), _stream); std::transform(_input_columns.cbegin(), _input_columns.cend(), std::back_inserter(h_cols_info), diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 8413e716224..2df71b77301 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1824,7 +1824,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type max_page_fragment_size = max_page_fragment_size_opt.value_or(default_max_page_fragment_size); - std::vector column_frag_size(num_columns, max_page_fragment_size); + auto column_frag_size = cudf::detail::make_host_vector(num_columns, stream); + std::fill(column_frag_size.begin(), column_frag_size.end(), max_page_fragment_size); if (input.num_rows() > 0 && not max_page_fragment_size_opt.has_value()) { std::vector column_sizes; @@ -1880,7 +1881,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type num_fragments = std::reduce(num_frag_in_part.begin(), num_frag_in_part.end()); - std::vector part_frag_offset; // Store the idx of the first fragment in each partition + auto part_frag_offset = + cudf::detail::make_empty_host_vector(num_frag_in_part.size() + 1, stream); + // Store the idx of 
the first fragment in each partition std::exclusive_scan( num_frag_in_part.begin(), num_frag_in_part.end(), std::back_inserter(part_frag_offset), 0); part_frag_offset.push_back(part_frag_offset.back() + num_frag_in_part.back()); diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index 5625e1bf05c..50f40924478 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -257,10 +257,8 @@ dremel_data get_encoding(column_view h_col, }, stream); - thrust::host_vector column_offsets = - cudf::detail::make_host_vector_async(d_column_offsets, stream); - thrust::host_vector column_ends = - cudf::detail::make_host_vector_async(d_column_ends, stream); + auto column_offsets = cudf::detail::make_host_vector_async(d_column_offsets, stream); + auto column_ends = cudf::detail::make_host_vector_async(d_column_ends, stream); stream.synchronize(); size_t max_vals_size = 0; diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index c4cc0dbe09d..b534e9b2e5b 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -169,8 +169,10 @@ std::unique_ptr join_strings(strings_column_view const& input, // build the offsets: single string output has offsets [0,chars-size] auto offsets_column = [&] { - auto offsets = cudf::detail::make_device_uvector_async( - std::vector({0, static_cast(chars.size())}), stream, mr); + auto h_offsets = cudf::detail::make_host_vector(2, stream); + h_offsets[0] = 0; + h_offsets[1] = chars.size(); + auto offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr); return std::make_unique(std::move(offsets), rmm::device_buffer{}, 0); }(); diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 2f4ebf97264..64a2107e17a 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -123,7 +123,7 @@ struct format_compiler { : format(fmt), d_items(0, stream) { specifiers.insert(extra_specifiers.begin(), extra_specifiers.end()); - std::vector items; + auto items = cudf::detail::make_empty_host_vector(format.length(), stream); auto str = format.data(); auto length = format.length(); while (length > 0) { diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 7622e39e735..352e0f9f41a 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -79,7 +79,7 @@ auto create_strings_device_views(host_span views, rmm::cuda_s // Compute the partition offsets and size of offset column // Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type - auto input_offsets = std::vector(views.size() + 1); + auto input_offsets = cudf::detail::make_host_vector(views.size() + 1, stream); auto offset_it = std::next(input_offsets.begin()); thrust::transform( thrust::host, views.begin(), views.end(), offset_it, [](auto const& col) -> size_t { diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index a34828fa97e..48620af8cad 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -129,7 +129,7 @@ std::unique_ptr filter_characters( // convert input table for copy to device memory size_type table_size = static_cast(characters_to_filter.size()); - thrust::host_vector htable(table_size); + auto htable = cudf::detail::make_host_vector(table_size, stream); std::transform( characters_to_filter.begin(), characters_to_filter.end(), htable.begin(), [](auto entry) { return char_range{entry.first, 
entry.second}; diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index cd60a4296b9..31234ea42ec 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -171,7 +171,7 @@ std::unique_ptr replace_re(strings_column_view const& input, auto d_buffer = rmm::device_buffer(buffer_size, stream); // copy all the reprog_device instances to a device memory array - std::vector progs; + auto progs = cudf::detail::make_empty_host_vector(h_progs.size(), stream); std::transform(h_progs.begin(), h_progs.end(), std::back_inserter(progs), diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 16b22d0de4c..a242b008a54 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -97,7 +97,7 @@ std::unique_ptr translate(strings_column_view const& strings, size_type table_size = static_cast(chars_table.size()); // convert input table - thrust::host_vector htable(table_size); + auto htable = cudf::detail::make_host_vector(table_size, stream); std::transform(chars_table.begin(), chars_table.end(), htable.begin(), [](auto entry) { return translate_table{entry.first, entry.second}; }); diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 13c31e8ae4c..2969557c78f 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -308,7 +308,10 @@ auto decompose_structs(table_view table, auto list_lex_preprocess(table_view const& table, rmm::cuda_stream_view stream) { std::vector dremel_data; - std::vector dremel_device_views; + auto const num_list_columns = std::count_if( + table.begin(), table.end(), [](auto const& col) { return col.type().id() == type_id::LIST; }); + auto dremel_device_views = + cudf::detail::make_empty_host_vector(num_list_columns, stream); for (auto const& col : table) { if (col.type().id() == type_id::LIST) { dremel_data.push_back(detail::get_comparator_data(col, {}, false, stream)); diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 3d0822d8545..0efb881eb3e 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -14,6 +14,9 @@ * limitations under the License. 
*/ +#include "cudf/detail/utilities/integer_utils.hpp" + +#include #include #include #include @@ -26,15 +29,24 @@ namespace cudf::detail { namespace { +// Simple kernel to copy between device buffers +CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx < n) { dst[idx] = src[idx]; } +} + void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) { if (size == 0) return; if (size < get_kernel_pinned_copy_threshold()) { - thrust::copy_n(rmm::exec_policy_nosync(stream), - static_cast(src), - size, - static_cast(dst)); + const int block_size = 256; + auto const grid_size = cudf::util::div_rounding_up_safe(size, block_size); + // We are explicitly launching the kernel here instead of calling a thrust function because the + // thrust function can potentially call cudaMemcpyAsync instead of using a kernel + copy_kernel<<>>( + static_cast(src), static_cast(dst), size); } else { CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); } diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/host_memory.cpp similarity index 73% rename from cpp/src/utilities/pinned_memory.cpp rename to cpp/src/utilities/host_memory.cpp index 3ea4293fc60..7c3cea42023 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -83,7 +83,7 @@ class fixed_pinned_pool_memory_resource { void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, - cuda::stream_ref stream) noexcept + cuda::stream_ref stream) { if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr < pool_end_) { pool_->deallocate_async(ptr, bytes, alignment, stream); @@ -92,14 +92,14 @@ class fixed_pinned_pool_memory_resource { } } - void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) { return deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); } void deallocate(void* ptr, std::size_t bytes, - std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { deallocate_async(ptr, bytes, alignment, stream_); stream_.wait(); @@ -186,6 +186,61 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& host_mr() return mr_ref; } +class new_delete_memory_resource { + public: + void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) + { + try { + return rmm::detail::aligned_host_allocate( + bytes, alignment, [](std::size_t size) { return ::operator new(size); }); + } catch (std::bad_alloc const& e) { + CUDF_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory); + } + } + + void* allocate_async(std::size_t bytes, [[maybe_unused]] cuda::stream_ref stream) + { + return allocate(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT); + } + + void* allocate_async(std::size_t bytes, + std::size_t alignment, + [[maybe_unused]] cuda::stream_ref stream) + { + return allocate(bytes, alignment); + } + + void deallocate(void* ptr, + std::size_t bytes, + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) + { + rmm::detail::aligned_host_deallocate( + ptr, bytes, alignment, [](void* ptr) { ::operator delete(ptr); }); + } + + void deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + [[maybe_unused]] cuda::stream_ref stream) + { + deallocate(ptr, bytes, alignment); + } + + void deallocate_async(void* ptr, std::size_t bytes, 
cuda::stream_ref stream) + { + deallocate(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT); + } + + bool operator==(new_delete_memory_resource const& other) const { return true; } + + bool operator!=(new_delete_memory_resource const& other) const { return !operator==(other); } + + friend void get_property(new_delete_memory_resource const&, cuda::mr::host_accessible) noexcept {} +}; + +static_assert(cuda::mr::resource_with, + "Pageable pool mr must be accessible from the host"); + } // namespace rmm::host_device_async_resource_ref set_pinned_memory_resource( @@ -225,4 +280,29 @@ void set_kernel_pinned_copy_threshold(size_t threshold) size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(); } +CUDF_EXPORT auto& allocate_host_as_pinned_threshold() +{ + // use pageable memory for all host allocations + static std::atomic threshold = 0; + return threshold; +} + +void set_allocate_host_as_pinned_threshold(size_t threshold) +{ + allocate_host_as_pinned_threshold() = threshold; +} + +size_t get_allocate_host_as_pinned_threshold() { return allocate_host_as_pinned_threshold(); } + +namespace detail { + +CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource() +{ + static new_delete_memory_resource mr{}; + static rmm::host_async_resource_ref mr_ref{mr}; + return mr_ref; +} + +} // namespace detail + } // namespace cudf diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp index 7a72b77e1fb..8bcd5790e99 100644 --- a/cpp/tests/io/json/json_tree.cpp +++ b/cpp/tests/io/json/json_tree.cpp @@ -235,10 +235,8 @@ tree_meta_t2 get_tree_representation_cpu( { constexpr bool include_quote_char = true; // Copy the JSON tokens to the host - thrust::host_vector tokens = - cudf::detail::make_host_vector_async(tokens_gpu, stream); - thrust::host_vector token_indices = - cudf::detail::make_host_vector_async(token_indices_gpu1, stream); + auto tokens = cudf::detail::make_host_vector_async(tokens_gpu, stream); + auto token_indices = cudf::detail::make_host_vector_async(token_indices_gpu1, stream); // Make sure tokens have been copied to the host stream.synchronize(); diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 51e9b3bd0a0..7a038fa6d75 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -294,7 +294,7 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger) std::iota(h_integers.begin(), h_integers.end(), -(TypeParam)(h_integers.size() / 2)); h_integers.push_back(std::numeric_limits::min()); h_integers.push_back(std::numeric_limits::max()); - auto d_integers = cudf::detail::make_device_uvector_sync( + auto const d_integers = cudf::detail::make_device_uvector_sync( h_integers, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto integers = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, (cudf::size_type)d_integers.size()); @@ -308,8 +308,6 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger) // convert to strings auto results_strings = cudf::strings::from_integers(integers->view()); - // copy back to host - h_integers = cudf::detail::make_host_vector_sync(d_integers, cudf::get_default_stream()); std::vector h_strings; for (auto itr = h_integers.begin(); itr != h_integers.end(); ++itr) h_strings.push_back(std::to_string(*itr)); diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index df9103640f4..93259fd63ee 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp 
+++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -18,16 +18,33 @@ #include #include +#include #include +#include #include #include #include #include -class PinnedMemoryTest : public cudf::test::BaseFixture {}; +class PinnedMemoryTest : public cudf::test::BaseFixture { + size_t prev_copy_threshold; + size_t prev_alloc_threshold; -TEST(PinnedMemoryTest, MemoryResourceGetAndSet) + public: + PinnedMemoryTest() + : prev_copy_threshold{cudf::get_kernel_pinned_copy_threshold()}, + prev_alloc_threshold{cudf::get_allocate_host_as_pinned_threshold()} + { + } + ~PinnedMemoryTest() override + { + cudf::set_kernel_pinned_copy_threshold(prev_copy_threshold); + cudf::set_allocate_host_as_pinned_threshold(prev_alloc_threshold); + } +}; + +TEST_F(PinnedMemoryTest, MemoryResourceGetAndSet) { // Global environment for temporary files auto const temp_env = static_cast( @@ -63,3 +80,49 @@ TEST(PinnedMemoryTest, MemoryResourceGetAndSet) // reset memory resource back cudf::set_pinned_memory_resource(last_mr); } + +TEST_F(PinnedMemoryTest, KernelCopyThresholdGetAndSet) +{ + cudf::set_kernel_pinned_copy_threshold(12345); + EXPECT_EQ(cudf::get_kernel_pinned_copy_threshold(), 12345); +} + +TEST_F(PinnedMemoryTest, HostAsPinnedThresholdGetAndSet) +{ + cudf::set_allocate_host_as_pinned_threshold(12345); + EXPECT_EQ(cudf::get_allocate_host_as_pinned_threshold(), 12345); +} + +TEST_F(PinnedMemoryTest, MakePinnedVector) +{ + cudf::set_allocate_host_as_pinned_threshold(0); + + // should always use pinned memory + { + auto const vec = cudf::detail::make_pinned_vector_async(1, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } +} + +TEST_F(PinnedMemoryTest, MakeHostVector) +{ + cudf::set_allocate_host_as_pinned_threshold(7); + + // allocate smaller than the threshold + { + auto const vec = cudf::detail::make_host_vector(1, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } + + // allocate the same size as the threshold + { + auto const vec = cudf::detail::make_host_vector(7, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } + + // allocate larger than the threshold + { + auto const vec = cudf::detail::make_host_vector(2, cudf::get_default_stream()); + EXPECT_FALSE(vec.get_allocator().is_device_accessible()); + } +} From 75289c58f3d9ca11a51396e4adadfbd5f51856f5 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 23 Jul 2024 23:45:59 -0500 Subject: [PATCH 08/15] Rename PrefetchConfig to prefetch_config. (#16358) This PR addresses a comment requesting a rename of `PrefetchConfig` to `prefetch_config`. 
See: https://github.com/rapidsai/cudf/pull/16020#discussion_r1686284151 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Shruti Shivakumar (https://github.com/shrshi) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/16358 --- cpp/include/cudf/utilities/prefetch.hpp | 10 +++++----- cpp/src/column/column_view.cpp | 2 +- cpp/src/utilities/prefetch.cpp | 21 ++++++++++++--------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/cpp/include/cudf/utilities/prefetch.hpp b/cpp/include/cudf/utilities/prefetch.hpp index 5ca6fd6f4b0..88c634a7cc7 100644 --- a/cpp/include/cudf/utilities/prefetch.hpp +++ b/cpp/include/cudf/utilities/prefetch.hpp @@ -31,17 +31,17 @@ namespace detail { /** * @brief A singleton class that manages the prefetching configuration. */ -class PrefetchConfig { +class prefetch_config { public: - PrefetchConfig& operator=(const PrefetchConfig&) = delete; - PrefetchConfig(const PrefetchConfig&) = delete; + prefetch_config& operator=(const prefetch_config&) = delete; + prefetch_config(const prefetch_config&) = delete; /** * @brief Get the singleton instance of the prefetching configuration. * * @return The singleton instance of the prefetching configuration. */ - static PrefetchConfig& instance(); + static prefetch_config& instance(); /** * @brief Get the value of a configuration key. @@ -65,7 +65,7 @@ class PrefetchConfig { bool debug{false}; private: - PrefetchConfig() = default; //< Private constructor to enforce singleton pattern + prefetch_config() = default; //< Private constructor to enforce singleton pattern std::map config_values; //< Map of configuration keys to values }; diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index a9605efb362..b0f9e9f0e74 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -39,7 +39,7 @@ namespace { template void prefetch_col_data(ColumnView& col, void const* data_ptr, std::string_view key) noexcept { - if (cudf::experimental::prefetch::detail::PrefetchConfig::instance().get(key)) { + if (cudf::experimental::prefetch::detail::prefetch_config::instance().get(key)) { if (cudf::is_fixed_width(col.type())) { cudf::experimental::prefetch::detail::prefetch_noexcept( key, data_ptr, col.size() * size_of(col.type()), cudf::get_default_stream()); diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp index 21f2e40c82a..16f2c3a1202 100644 --- a/cpp/src/utilities/prefetch.cpp +++ b/cpp/src/utilities/prefetch.cpp @@ -26,13 +26,13 @@ namespace cudf::experimental::prefetch { namespace detail { -PrefetchConfig& PrefetchConfig::instance() +prefetch_config& prefetch_config::instance() { - static PrefetchConfig instance; + static prefetch_config instance; return instance; } -bool PrefetchConfig::get(std::string_view key) +bool prefetch_config::get(std::string_view key) { // Default to not prefetching if (config_values.find(key.data()) == config_values.end()) { @@ -40,7 +40,7 @@ bool PrefetchConfig::get(std::string_view key) } return config_values[key.data()]; } -void PrefetchConfig::set(std::string_view key, bool value) { config_values[key.data()] = value; } +void prefetch_config::set(std::string_view key, bool value) { config_values[key.data()] = value; } cudaError_t prefetch_noexcept(std::string_view key, void const* ptr, @@ -48,8 +48,8 @@ cudaError_t prefetch_noexcept(std::string_view key, rmm::cuda_stream_view stream, 
rmm::cuda_device_id device_id) noexcept { - if (PrefetchConfig::instance().get(key)) { - if (PrefetchConfig::instance().debug) { + if (prefetch_config::instance().get(key)) { + if (prefetch_config::instance().debug) { std::cerr << "Prefetching " << size << " bytes for key " << key << " at location " << ptr << std::endl; } @@ -78,12 +78,15 @@ void prefetch(std::string_view key, } // namespace detail -void enable_prefetching(std::string_view key) { detail::PrefetchConfig::instance().set(key, true); } +void enable_prefetching(std::string_view key) +{ + detail::prefetch_config::instance().set(key, true); +} void disable_prefetching(std::string_view key) { - detail::PrefetchConfig::instance().set(key, false); + detail::prefetch_config::instance().set(key, false); } -void prefetch_debugging(bool enable) { detail::PrefetchConfig::instance().debug = enable; } +void prefetch_debugging(bool enable) { detail::prefetch_config::instance().debug = enable; } } // namespace cudf::experimental::prefetch From 8c1749b40eaa983966ed3bece6bdd29a4316d18a Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Wed, 24 Jul 2024 01:19:10 -0400 Subject: [PATCH 09/15] Use rapids_cpm_bs_thread_pool() (#16360) Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16360 --- cpp/CMakeLists.txt | 2 +- cpp/cmake/thirdparty/get_thread_pool.cmake | 20 +++++++------------- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a2c2dd3af4c..b044545bb08 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -807,7 +807,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm $ + PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm $ PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio $ nanoarrow ) diff --git a/cpp/cmake/thirdparty/get_thread_pool.cmake b/cpp/cmake/thirdparty/get_thread_pool.cmake index 264257c7199..235bf409058 100644 --- a/cpp/cmake/thirdparty/get_thread_pool.cmake +++ b/cpp/cmake/thirdparty/get_thread_pool.cmake @@ -12,20 +12,14 @@ # the License. # ============================================================================= -# This function finds rmm and sets any additional necessary environment variables. 
+# Need to call rapids_cpm_bs_thread_pool to get support for an installed version of thread-pool and +# to support installing it ourselves function(find_and_configure_thread_pool) - rapids_cpm_find( - BS_thread_pool 4.1.0 - CPM_ARGS - GIT_REPOSITORY https://github.com/bshoshany/thread-pool.git - GIT_TAG 097aa718f25d44315cadb80b407144ad455ee4f9 - GIT_SHALLOW TRUE - ) - if(NOT TARGET BS_thread_pool) - add_library(BS_thread_pool INTERFACE) - target_include_directories(BS_thread_pool INTERFACE ${BS_thread_pool_SOURCE_DIR}/include) - target_compile_definitions(BS_thread_pool INTERFACE "BS_THREAD_POOL_ENABLE_PAUSE=1") - endif() + include(${rapids-cmake-dir}/cpm/bs_thread_pool.cmake) + + # Find or install thread-pool + rapids_cpm_bs_thread_pool(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports) + endfunction() find_and_configure_thread_pool() From 62625f1bfcdb980186a1afbec41e420fdb4a7075 Mon Sep 17 00:00:00 2001 From: Matt Topol Date: Wed, 24 Jul 2024 03:42:03 -0400 Subject: [PATCH 10/15] Host implementation of `to_arrow` using nanoarrow (#16297) Adds the corresponding `to_arrow_host` functions for interop using `ArrowDeviceArray`. This includes updating the version of nanoarrow in use to pick up some bug fixes and features. Authors: - Matt Topol (https://github.com/zeroshade) - Muhammad Haseeb (https://github.com/mhaseeb123) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16297 --- cpp/CMakeLists.txt | 1 + cpp/cmake/thirdparty/get_nanoarrow.cmake | 4 +- cpp/include/cudf/interop.hpp | 80 +- cpp/include/cudf/interop/detail/arrow.hpp | 53 - cpp/src/interop/arrow_utilities.cpp | 31 + cpp/src/interop/arrow_utilities.hpp | 43 +- cpp/src/interop/from_arrow_device.cu | 10 +- cpp/src/interop/from_arrow_host.cu | 2 +- cpp/src/interop/to_arrow.cu | 33 +- cpp/src/interop/to_arrow_device.cu | 101 +- cpp/src/interop/to_arrow_host.cu | 428 ++++++++ cpp/src/interop/to_arrow_schema.cpp | 7 +- cpp/tests/CMakeLists.txt | 1 + cpp/tests/interop/nanoarrow_utils.hpp | 9 +- cpp/tests/interop/to_arrow_device_test.cpp | 1 - cpp/tests/interop/to_arrow_host_test.cpp | 1117 ++++++++++++++++++++ 16 files changed, 1760 insertions(+), 161 deletions(-) delete mode 100644 cpp/include/cudf/interop/detail/arrow.hpp create mode 100644 cpp/src/interop/to_arrow_host.cu create mode 100644 cpp/tests/interop/to_arrow_host_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index b044545bb08..24b683a930b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -367,6 +367,7 @@ add_library( src/interop/arrow_utilities.cpp src/interop/to_arrow.cu src/interop/to_arrow_device.cu + src/interop/to_arrow_host.cu src/interop/from_arrow_device.cu src/interop/from_arrow_host.cu src/interop/from_arrow_stream.cu diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake index 025bff7d8f0..8df1b431095 100644 --- a/cpp/cmake/thirdparty/get_nanoarrow.cmake +++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake @@ -17,11 +17,11 @@ function(find_and_configure_nanoarrow) # Currently we need to always build nanoarrow so we don't pickup a previous installed version set(CPM_DOWNLOAD_nanoarrow ON) rapids_cpm_find( - nanoarrow 0.5.0 + nanoarrow 0.6.0.dev GLOBAL_TARGETS nanoarrow CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git - GIT_TAG 
11e73a8c85b45e3d49c8c541b4e1497a649fe03c + GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb GIT_SHALLOW FALSE OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 11f6ce2bad7..61f7d72a467 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -136,6 +136,8 @@ struct column_metadata { * Converts the `cudf::table_view` to `arrow::Table` with the provided * metadata `column_names`. * + * @deprecated Since 24.08. Use cudf::to_arrow_host instead. + * * @throws cudf::logic_error if `column_names` size doesn't match with number of columns. * * @param input table_view that needs to be converted to arrow Table @@ -150,16 +152,19 @@ struct column_metadata { * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be * converted to Arrow decimal128 of the precision 38. */ -std::shared_ptr to_arrow(table_view input, - std::vector const& metadata = {}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); +[[deprecated]] std::shared_ptr to_arrow( + table_view input, + std::vector const& metadata = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** * @brief Create `arrow::Scalar` from cudf scalar `input` * * Converts the `cudf::scalar` to `arrow::Scalar`. * + * @deprecated Since 24.08. + * * @param input scalar that needs to be converted to arrow Scalar * @param metadata Contains hierarchy of names of columns and children * @param stream CUDA stream used for device memory operations and kernel launches @@ -172,10 +177,11 @@ std::shared_ptr to_arrow(table_view input, * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be * converted to Arrow decimal128 of the precision 38. */ -std::shared_ptr to_arrow(cudf::scalar const& input, - column_metadata const& metadata = {}, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); +[[deprecated]] std::shared_ptr to_arrow( + cudf::scalar const& input, + column_metadata const& metadata = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** * @brief typedef for a unique_ptr to an ArrowSchema with custom deleter @@ -329,15 +335,67 @@ unique_device_array_t to_arrow_device( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); +/** + * @brief Copy table view data to host and create `ArrowDeviceArray` for it + * + * Populates the C struct ArrowDeviceArray, copying the cudf data to the host. The + * returned ArrowDeviceArray will have a device_type of CPU and will have no ties + * to the memory referenced by the table view passed in. The deleter for the + * returned unique_ptr will call the release callback on the ArrowDeviceArray + * automatically. + * + * @note For decimals, since the precision is not stored for them in libcudf, it will + * be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of precision 38. 
+ * + * @param table Input table + * @param stream CUDA stream used for the device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray generated from input table + */ +unique_device_array_t to_arrow_host( + cudf::table_view const& table, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Copy column view data to host and create `ArrowDeviceArray` for it + * + * Populates the C struct ArrowDeviceArray, copying the cudf data to the host. The + * returned ArrowDeviceArray will have a device_type of CPU and will have no ties + * to the memory referenced by the column view passed in. The deleter for the + * returned unique_ptr will call the release callback on the ArrowDeviceArray + * automatically. + * + * @note For decimals, since the precision is not stored for them in libcudf, it will + * be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type + * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision + * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be + * converted to Arrow decimal128 of precision 38. + * + * @param col Input column + * @param stream CUDA stream used for the device memory operations and kernel launches + * @param mr Device memory resource used for any allocations during conversion + * @return ArrowDeviceArray generated from input column + */ +unique_device_array_t to_arrow_host( + cudf::column_view const& col, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + /** * @brief Create `cudf::table` from given arrow Table input * + * @deprecated Since 24.08. Use cudf::from_arrow_host instead. + * * @param input arrow:Table that needs to be converted to `cudf::table` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate `cudf::table` * @return cudf table generated from given arrow Table */ -std::unique_ptr
<table> from_arrow( +[[deprecated]] std::unique_ptr<table>
from_arrow( arrow::Table const& input, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); @@ -345,12 +403,14 @@ std::unique_ptr<table>
from_arrow( /** * @brief Create `cudf::scalar` from given arrow Scalar input * + * @deprecated Since 24.08. + * * @param input `arrow::Scalar` that needs to be converted to `cudf::scalar` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate `cudf::scalar` * @return cudf scalar generated from given arrow Scalar */ -std::unique_ptr from_arrow( +[[deprecated]] std::unique_ptr from_arrow( arrow::Scalar const& input, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/interop/detail/arrow.hpp b/cpp/include/cudf/interop/detail/arrow.hpp deleted file mode 100644 index 906d48f636b..00000000000 --- a/cpp/include/cudf/interop/detail/arrow.hpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -// from Arrow C Device Data Interface -// https://arrow.apache.org/docs/format/CDeviceDataInterface.html -#ifndef ARROW_C_DEVICE_DATA_INTERFACE -#define ARROW_C_DEVICE_DATA_INTERFACE - -// Device type for the allocated memory -using ArrowDeviceType = int32_t; - -// The Arrow spec specifies using macros rather than enums here to avoid being -// susceptible to changes in the underlying type chosen by the compiler, but -// clang-tidy doesn't like this. 
-// NOLINTBEGIN -// CPU device, same as using ArrowArray directly -#define ARROW_DEVICE_CPU 1 -// CUDA GPU Device -#define ARROW_DEVICE_CUDA 2 -// Pinned CUDA CPU memory by cudaMallocHost -#define ARROW_DEVICE_CUDA_HOST 3 -// CUDA managed/unified memory allocated by cudaMallocManaged -#define ARROW_DEVICE_CUDA_MANAGED 13 -// NOLINTEND - -struct ArrowDeviceArray { - struct ArrowArray array; - int64_t device_id; - ArrowDeviceType device_type; - void* sync_event; - - // reserved bytes for future expansion - int64_t reserved[3]; -}; - -#endif // ARROW_C_DEVICE_DATA_INTERFACE diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index 605d813ed1e..4292552a800 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -16,9 +16,16 @@ #include "arrow_utilities.hpp" +#include #include #include +#include +#include + +#include +#include + #include namespace cudf { @@ -83,9 +90,33 @@ ArrowType id_to_arrow_type(cudf::type_id id) case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT; case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE; case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32; + case cudf::type_id::DECIMAL128: return NANOARROW_TYPE_DECIMAL128; default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error); } } +ArrowType id_to_arrow_storage_type(cudf::type_id id) +{ + switch (id) { + case cudf::type_id::TIMESTAMP_SECONDS: + case cudf::type_id::TIMESTAMP_MILLISECONDS: + case cudf::type_id::TIMESTAMP_MICROSECONDS: + case cudf::type_id::TIMESTAMP_NANOSECONDS: return NANOARROW_TYPE_INT64; + case cudf::type_id::DURATION_SECONDS: + case cudf::type_id::DURATION_MILLISECONDS: + case cudf::type_id::DURATION_MICROSECONDS: + case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TYPE_INT64; + default: return id_to_arrow_type(id); + } +} + +int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column) +{ + NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(arr, storage_type)); + arr->length = column.size(); + arr->null_count = column.null_count(); + return NANOARROW_OK; +} + } // namespace detail } // namespace cudf diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp index 4e2628ab689..1cee3071fcb 100644 --- a/cpp/src/interop/arrow_utilities.hpp +++ b/cpp/src/interop/arrow_utilities.hpp @@ -18,8 +18,12 @@ #include +#include +#include +#include +#include + #include -#include namespace cudf { namespace detail { @@ -47,5 +51,42 @@ data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view); */ ArrowType id_to_arrow_type(cudf::type_id id); +/** + * @brief Map cudf column type id to the storage type for Arrow + * + * Specifically this is for handling the underlying storage type of + * timestamps and durations. + * + * @param id column type id + * @return ArrowType storage type + */ +ArrowType id_to_arrow_storage_type(cudf::type_id id); + +/** + * @brief Helper to initialize ArrowArray struct + * + * @param arr Pointer to ArrowArray to initialize + * @param storage_type The type to initialize with + * @param column view for column to get the length and null count from + * @return nanoarrow status code, should be NANOARROW_OK if there are no errors + */ +int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column); + +/** + * @brief Helper to convert decimal values to 128-bit versions for Arrow compatibility + * + * The template parameter should be the underlying type of the data (e.g. 
int32_t for + * 32-bit decimal and int64_t for 64-bit decimal). + * + * @param input column_view of the data + * @param stream cuda stream to perform the operations on + * @param mr memory resource to allocate the returned device_uvector with + * @return unique_ptr to a device_buffer containing the upcasted data + */ +template +std::unique_ptr decimals_to_arrow(cudf::column_view input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + } // namespace detail } // namespace cudf diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index e1d289e67a3..440df571de0 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -39,6 +38,7 @@ #include #include +#include namespace cudf { @@ -144,9 +144,6 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_EXPECTS(schema->type != NANOARROW_TYPE_LARGE_STRING, - "Large strings are not yet supported in from_arrow_device", - cudf::data_type_error); if (input->length == 0) { return std::make_tuple( {type, @@ -158,12 +155,15 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( {}); } - auto offsets_view = column_view{data_type(type_id::INT32), + data_type offsets_type(type_id::INT32); + if (schema->type == NANOARROW_TYPE_LARGE_STRING) { offsets_type = data_type(type_id::INT64); } + auto offsets_view = column_view{offsets_type, static_cast(input->offset + input->length) + 1, input->buffers[fixed_width_data_buffer_idx], nullptr, 0, 0}; + return std::make_tuple( {type, static_cast(input->length), diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index b3087dedf98..efde8f2a463 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -42,6 +41,7 @@ #include #include +#include namespace cudf { namespace detail { diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 622a3aba4bb..e89ecedc218 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "arrow_utilities.hpp" #include "detail/arrow_allocator.hpp" #include @@ -157,33 +158,17 @@ std::shared_ptr unsupported_decimals_to_arrow(column_view input, arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); - - rmm::device_uvector buf(input.size() * BIT_WIDTH_RATIO, stream); - - auto count = thrust::make_counting_iterator(0); - - thrust::for_each( - rmm::exec_policy(cudf::get_default_stream()), - count, - count + input.size(), - [in = input.begin(), out = buf.data(), BIT_WIDTH_RATIO] __device__(auto in_idx) { - auto const out_idx = in_idx * BIT_WIDTH_RATIO; - // The lowest order bits are the value, the remainder - // simply matches the sign bit to satisfy the two's - // complement integer representation of negative numbers. - out[out_idx] = in[in_idx]; -#pragma unroll BIT_WIDTH_RATIO - 1 - for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { - out[out_idx + i] = in[in_idx] < 0 ? 
-1 : 0; - } - }); + auto buf = + detail::decimals_to_arrow(input, stream, rmm::mr::get_current_device_resource()); - auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); + auto const buf_size_in_bytes = buf->size(); auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); - CUDF_CUDA_TRY(cudaMemcpyAsync( - data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value())); + CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(), + buf->data(), + buf_size_in_bytes, + cudaMemcpyDefault, + stream.value())); auto type = arrow::decimal(precision, -input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu index b9d3a59e647..2eb9b912054 100644 --- a/cpp/src/interop/to_arrow_device.cu +++ b/cpp/src/interop/to_arrow_device.cu @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -44,6 +43,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -56,14 +56,6 @@ void device_buffer_finalize(ArrowBufferAllocator* allocator, uint8_t*, int64_t) delete unique_buffer; } -int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column) -{ - NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(arr, storage_type)); - arr->length = column.size(); - arr->null_count = column.null_count(); - return NANOARROW_OK; -} - template struct is_device_scalar : public std::false_type {}; @@ -99,21 +91,6 @@ int set_buffer(std::unique_ptr device_buf, int64_t i, ArrowArray* out) return NANOARROW_OK; } -ArrowType id_to_arrow_storage_type(cudf::type_id id) -{ - switch (id) { - case cudf::type_id::TIMESTAMP_SECONDS: - case cudf::type_id::TIMESTAMP_MILLISECONDS: - case cudf::type_id::TIMESTAMP_MICROSECONDS: - case cudf::type_id::TIMESTAMP_NANOSECONDS: return NANOARROW_TYPE_INT64; - case cudf::type_id::DURATION_SECONDS: - case cudf::type_id::DURATION_MILLISECONDS: - case cudf::type_id::DURATION_MICROSECONDS: - case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TYPE_INT64; - default: return id_to_arrow_type(id); - } -} - struct dispatch_to_arrow_device { template ())> int operator()(cudf::column&&, rmm::cuda_stream_view, rmm::device_async_resource_ref, ArrowArray*) @@ -156,35 +133,15 @@ struct dispatch_to_arrow_device { }; template -int decimals_to_arrow(cudf::column_view input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr, - ArrowArray* out) +int construct_decimals(cudf::column_view input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr, + ArrowArray* out) { nanoarrow::UniqueArray tmp; NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input)); - constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); - auto buf = - std::make_unique>(input.size() * BIT_WIDTH_RATIO, stream, mr); - - auto count = thrust::counting_iterator(0); - - thrust::for_each( - rmm::exec_policy(stream, mr), - count, - count + input.size(), - [in = input.begin(), out = buf->data(), BIT_WIDTH_RATIO] __device__(auto in_idx) { - auto const out_idx = in_idx * BIT_WIDTH_RATIO; - // the lowest order bits are the value, the remainder - // simply matches the sign bit to satisfy the two's - // complement integer representation of negative numbers. - out[out_idx] = in[in_idx]; -#pragma unroll BIT_WIDTH_RATIO - 1 - for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { - out[out_idx + i] = in[in_idx] < 0 ? 
-1 : 0; - } - }); + auto buf = detail::decimals_to_arrow(input, stream, mr); NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get())); ArrowArrayMove(tmp.get(), out); @@ -198,7 +155,7 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colu ArrowArray* out) { using DeviceType = int32_t; - NANOARROW_RETURN_NOT_OK(decimals_to_arrow(column.view(), stream, mr, out)); + NANOARROW_RETURN_NOT_OK(construct_decimals(column.view(), stream, mr, out)); auto contents = column.release(); NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out)); return NANOARROW_OK; @@ -211,7 +168,7 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colu ArrowArray* out) { using DeviceType = int64_t; - NANOARROW_RETURN_NOT_OK(decimals_to_arrow(column.view(), stream, mr, out)); + NANOARROW_RETURN_NOT_OK(construct_decimals(column.view(), stream, mr, out)); auto contents = column.release(); NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out)); return NANOARROW_OK; @@ -256,8 +213,15 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colum rmm::device_async_resource_ref mr, ArrowArray* out) { + ArrowType nanoarrow_type = NANOARROW_TYPE_STRING; + if (column.num_children() > 0 && + column.child(cudf::strings_column_view::offsets_column_index).type().id() == + cudf::type_id::INT64) { + nanoarrow_type = NANOARROW_TYPE_LARGE_STRING; + } + nanoarrow::UniqueArray tmp; - NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRING, column)); + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), nanoarrow_type, column)); if (column.size() == 0) { // the scalar zero here is necessary because the spec for string arrays states @@ -265,8 +229,14 @@ int dispatch_to_arrow_device::operator()(cudf::column&& colum // the case of a 0 length string array, there should be exactly 1 value, zero, // in the offsets buffer. While some arrow implementations may accept a zero-sized // offsets buffer, best practices would be to allocate the buffer with the single value. 
- auto zero = std::make_unique>(0, stream, mr); - NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + if (nanoarrow_type == NANOARROW_TYPE_STRING) { + auto zero = std::make_unique>(0, stream, mr); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + } else { + auto zero = std::make_unique>(0, stream, mr); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + } + ArrowArrayMove(tmp.get(), out); return NANOARROW_OK; } @@ -436,7 +406,7 @@ template <> int dispatch_to_arrow_device_view::operator()(ArrowArray* out) const { using DeviceType = int32_t; - NANOARROW_RETURN_NOT_OK(decimals_to_arrow(column, stream, mr, out)); + NANOARROW_RETURN_NOT_OK(construct_decimals(column, stream, mr, out)); NANOARROW_RETURN_NOT_OK(set_null_mask(column, out)); return NANOARROW_OK; } @@ -445,7 +415,7 @@ template <> int dispatch_to_arrow_device_view::operator()(ArrowArray* out) const { using DeviceType = int64_t; - NANOARROW_RETURN_NOT_OK(decimals_to_arrow(column, stream, mr, out)); + NANOARROW_RETURN_NOT_OK(construct_decimals(column, stream, mr, out)); NANOARROW_RETURN_NOT_OK(set_null_mask(column, out)); return NANOARROW_OK; } @@ -481,13 +451,26 @@ int dispatch_to_arrow_device_view::operator()(ArrowArray* out) const template <> int dispatch_to_arrow_device_view::operator()(ArrowArray* out) const { + ArrowType nanoarrow_type = NANOARROW_TYPE_STRING; + if (column.num_children() > 0 && + column.child(cudf::strings_column_view::offsets_column_index).type().id() == + cudf::type_id::INT64) { + nanoarrow_type = NANOARROW_TYPE_LARGE_STRING; + } + nanoarrow::UniqueArray tmp; - NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRING, column)); + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), nanoarrow_type, column)); if (column.size() == 0) { // https://github.com/rapidsai/cudf/pull/15047#discussion_r1546528552 - auto zero = std::make_unique>(0, stream, mr); - NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + if (nanoarrow_type == NANOARROW_TYPE_LARGE_STRING) { + auto zero = std::make_unique>(0, stream, mr); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + } else { + auto zero = std::make_unique>(0, stream, mr); + NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get())); + } + ArrowArrayMove(tmp.get(), out); return NANOARROW_OK; } diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu new file mode 100644 index 00000000000..c9e53ebaab7 --- /dev/null +++ b/cpp/src/interop/to_arrow_host.cu @@ -0,0 +1,428 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "arrow_utilities.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace detail { + +template +std::unique_ptr decimals_to_arrow(cudf::column_view input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); + auto buf = std::make_unique(input.size() * sizeof(__int128_t), stream, mr); + + auto count = thrust::counting_iterator(0); + thrust::for_each(rmm::exec_policy(stream, mr), + count, + count + input.size(), + [in = input.begin(), + out = reinterpret_cast(buf->data()), + BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // the lowest order bits are the value, the remainder + // simply matches the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; + } + }); + + return buf; +} + +template std::unique_ptr decimals_to_arrow( + cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +template std::unique_ptr decimals_to_arrow( + cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +namespace { + +struct dispatch_to_arrow_host { + cudf::column_view column; + rmm::cuda_stream_view stream; + rmm::device_async_resource_ref mr; + + int populate_validity_bitmap(ArrowBitmap* bitmap) const + { + if (!column.has_nulls()) { return NANOARROW_OK; } + + NANOARROW_RETURN_NOT_OK(ArrowBitmapResize(bitmap, static_cast(column.size()), 0)); + CUDF_CUDA_TRY(cudaMemcpyAsync(bitmap->buffer.data, + (column.offset() > 0) + ? cudf::detail::copy_bitmask(column, stream, mr).data() + : column.null_mask(), + bitmap->buffer.size_bytes, + cudaMemcpyDefault, + stream.value())); + return NANOARROW_OK; + } + + template + int populate_data_buffer(device_span input, ArrowBuffer* buffer) const + { + NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, input.size_bytes(), 1)); + CUDF_CUDA_TRY(cudaMemcpyAsync( + buffer->data, input.data(), input.size_bytes(), cudaMemcpyDefault, stream.value())); + return NANOARROW_OK; + } + + template () && !cudf::is_fixed_point())> + int operator()(ArrowArray*) const + { + CUDF_FAIL("Unsupported type for to_arrow_host", cudf::data_type_error); + } + + template () || std::is_same_v)> + int operator()(ArrowArray* out) const + { + nanoarrow::UniqueArray tmp; + + auto const storage_type = id_to_arrow_storage_type(column.type().id()); + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), storage_type, column)); + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + using DataType = std::conditional_t, __int128_t, T>; + NANOARROW_RETURN_NOT_OK( + populate_data_buffer(device_span(column.data(), column.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; + } + + // convert decimal types from libcudf to arrow where those types are not directly + // supported by Arrow. 
These types must be fit into 128 bits, the smallest + // decimal resolution supported by Arrow + template () && + (std::is_same_v || + std::is_same_v))> + int operator()(ArrowArray* out) const + { + using DeviceType = std::conditional_t, int32_t, int64_t>; + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column)); + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + auto buf = detail::decimals_to_arrow(column, stream, mr); + NANOARROW_RETURN_NOT_OK( + populate_data_buffer(device_span<__int128_t const>( + reinterpret_cast(buf->data()), column.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; + } +}; + +int get_column(cudf::column_view column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr, + ArrowArray* out); + +template <> +int dispatch_to_arrow_host::operator()(ArrowArray* out) const +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column)); + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + auto bitmask = bools_to_mask(column, stream, mr); + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(reinterpret_cast(bitmask.first->data()), + bitmask.first->size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_host::operator()(ArrowArray* out) const +{ + ArrowType nanoarrow_type = NANOARROW_TYPE_STRING; + if (column.num_children() > 0 && + column.child(cudf::strings_column_view::offsets_column_index).type().id() == + cudf::type_id::INT64) { + nanoarrow_type = NANOARROW_TYPE_LARGE_STRING; + } + + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), nanoarrow_type, column)); + + if (column.size() == 0) { + // initialize the offset buffer with a single zero by convention + if (nanoarrow_type == NANOARROW_TYPE_LARGE_STRING) { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt64(ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx), 0)); + } else { + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx), 0)); + } + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; + } + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + + auto const scv = cudf::strings_column_view(column); + auto const offsets = scv.offsets(); + if (offsets.type().id() == cudf::type_id::INT64) { + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(offsets.data() + scv.offset(), scv.size() + 1), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + } else { + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(offsets.data() + scv.offset(), scv.size() + 1), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + } + + NANOARROW_RETURN_NOT_OK( + populate_data_buffer(device_span(scv.chars_begin(stream), scv.chars_size(stream)), + ArrowArrayBuffer(tmp.get(), 2))); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_host::operator()(ArrowArray* out) const +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_LIST, column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), 1)); + + 
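A minimal sketch of the layout that the list path below reproduces, using assumed values that are not taken from this patch: a cudf lists column and an Arrow LIST array share the same shape (a validity bitmap, an int32 offsets buffer of length size()+1, and one child holding the flattened elements), so the code only copies the offsets buffer and converts the child column recursively through get_column().

    // assumed example column: [[1, 2], [], [3]]
    // offsets buffer:         {0, 2, 2, 3}
    // flattened child:        {1, 2, 3}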
NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + auto const lcv = cudf::lists_column_view(column); + + if (column.size() == 0) { + // initialize the offsets buffer with a single zero by convention for 0 length + NANOARROW_RETURN_NOT_OK( + ArrowBufferAppendInt32(ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx), 0)); + } else { + NANOARROW_RETURN_NOT_OK( + populate_data_buffer(device_span(lcv.offsets_begin(), (column.size() + 1)), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + } + + NANOARROW_RETURN_NOT_OK(get_column(lcv.child(), stream, mr, tmp->children[0])); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_host::operator()(ArrowArray* out) const +{ + nanoarrow::UniqueArray tmp; + NANOARROW_RETURN_NOT_OK(initialize_array( + tmp.get(), + id_to_arrow_type(column.child(cudf::dictionary_column_view::indices_column_index).type().id()), + column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateDictionary(tmp.get())); + + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + auto dcv = cudf::dictionary_column_view(column); + auto dict_indices = dcv.get_indices_annotated(); + switch (dict_indices.type().id()) { + case type_id::INT8: + case type_id::UINT8: + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(dict_indices.data(), dict_indices.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + break; + case type_id::INT16: + case type_id::UINT16: + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(dict_indices.data(), dict_indices.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + break; + case type_id::INT32: + case type_id::UINT32: + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(dict_indices.data(), dict_indices.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + break; + case type_id::INT64: + case type_id::UINT64: + NANOARROW_RETURN_NOT_OK(populate_data_buffer( + device_span(dict_indices.data(), dict_indices.size()), + ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx))); + break; + default: CUDF_FAIL("unsupported type for dictionary indices"); + } + + NANOARROW_RETURN_NOT_OK(get_column(dcv.keys(), stream, mr, tmp->dictionary)); + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +template <> +int dispatch_to_arrow_host::operator()(ArrowArray* out) const +{ + nanoarrow::UniqueArray tmp; + + NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRUCT, column)); + NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), column.num_children())); + NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get()))); + + auto const scv = cudf::structs_column_view(column); + + for (size_t i = 0; i < size_t(tmp->n_children); ++i) { + ArrowArray* child_ptr = tmp->children[i]; + auto const child = scv.get_sliced_child(i, stream); + NANOARROW_RETURN_NOT_OK(get_column(child, stream, mr, child_ptr)); + } + + ArrowArrayMove(tmp.get(), out); + return NANOARROW_OK; +} + +int get_column(cudf::column_view column, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr, + ArrowArray* out) +{ + return column.type().id() != type_id::EMPTY + ? 
type_dispatcher(column.type(), dispatch_to_arrow_host{column, stream, mr}, out) + : initialize_array(out, NANOARROW_TYPE_NA, column); +} + +unique_device_array_t create_device_array(nanoarrow::UniqueArray&& out) +{ + ArrowError err; + if (ArrowArrayFinishBuildingDefault(out.get(), &err) != NANOARROW_OK) { + std::cerr << err.message << std::endl; + CUDF_FAIL("failed to build"); + } + + unique_device_array_t result(new ArrowDeviceArray, [](ArrowDeviceArray* arr) { + if (arr->array.release != nullptr) { ArrowArrayRelease(&arr->array); } + delete arr; + }); + + result->device_id = -1; + result->device_type = ARROW_DEVICE_CPU; + result->sync_event = nullptr; + ArrowArrayMove(out.get(), &result->array); + return result; +} + +} // namespace + +unique_device_array_t to_arrow_host(cudf::table_view const& table, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + nanoarrow::UniqueArray tmp; + NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRUCT)); + + NANOARROW_THROW_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), table.num_columns())); + tmp->length = table.num_rows(); + tmp->null_count = 0; + + for (cudf::size_type i = 0; i < table.num_columns(); ++i) { + auto child = tmp->children[i]; + auto col = table.column(i); + NANOARROW_THROW_NOT_OK( + cudf::type_dispatcher(col.type(), detail::dispatch_to_arrow_host{col, stream, mr}, child)); + } + + // wait for all the stream operations to complete before we return. + // this ensures that the host memory that we're returning will be populated + // before we return from this function. + stream.synchronize(); + + return create_device_array(std::move(tmp)); +} + +unique_device_array_t to_arrow_host(cudf::column_view const& col, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + nanoarrow::UniqueArray tmp; + + NANOARROW_THROW_NOT_OK( + cudf::type_dispatcher(col.type(), detail::dispatch_to_arrow_host{col, stream, mr}, tmp.get())); + + // wait for all the stream operations to complete before we return. + // this ensures that the host memory that we're returning will be populated + // before we return from this function. + stream.synchronize(); + + return create_device_array(std::move(tmp)); +} + +} // namespace detail + +unique_device_array_t to_arrow_host(cudf::column_view const& col, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::to_arrow_host(col, stream, mr); +} + +unique_device_array_t to_arrow_host(cudf::table_view const& table, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::to_arrow_host(table, stream, mr); +} + +} // namespace cudf diff --git a/cpp/src/interop/to_arrow_schema.cpp b/cpp/src/interop/to_arrow_schema.cpp index 19915464236..b98ca8a7bed 100644 --- a/cpp/src/interop/to_arrow_schema.cpp +++ b/cpp/src/interop/to_arrow_schema.cpp @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -120,7 +119,11 @@ int dispatch_to_arrow_type::operator()(column_view input, column_metadata const&, ArrowSchema* out) { - return ArrowSchemaSetType(out, NANOARROW_TYPE_STRING); + return ((input.num_children() == 0 || + input.child(cudf::strings_column_view::offsets_column_index).type().id() == + type_id::INT32)) + ? 
ArrowSchemaSetType(out, NANOARROW_TYPE_STRING) + : ArrowSchemaSetType(out, NANOARROW_TYPE_LARGE_STRING); } // these forward declarations are needed due to the recursive calls to them diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 05e9759632f..88187623930 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -270,6 +270,7 @@ ConfigureTest( INTEROP_TEST interop/to_arrow_device_test.cpp interop/to_arrow_test.cpp + interop/to_arrow_host_test.cpp interop/from_arrow_test.cpp interop/from_arrow_device_test.cpp interop/from_arrow_host_test.cpp diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index 4147728b2a6..a961f73d955 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -29,6 +28,7 @@ #include #include +#include struct generated_test_data { generated_test_data(cudf::size_type length) @@ -211,6 +211,7 @@ DEFINE_NANOARROW_STORAGE(cudf::duration_us, INT64); DEFINE_NANOARROW_STORAGE(cudf::duration_ns, INT64); DEFINE_NANOARROW_STORAGE(uint8_t, UINT8); DEFINE_NANOARROW_STORAGE(int32_t, INT32); +DEFINE_NANOARROW_STORAGE(__int128_t, DECIMAL128); #undef DEFINE_NANOARROW_STORAGE @@ -255,8 +256,7 @@ std::enable_if_t, nanoarrow::UniqueArray> get_nanoarrow_ ArrowBitmap out; ArrowBitmapInit(&out); NANOARROW_THROW_NOT_OK(ArrowBitmapResize(&out, b.size(), 1)); - out.buffer.size_bytes = (b.size() >> 3) + ((b.size() & 7) != 0); - out.size_bits = b.size(); + std::memset(out.buffer.data, 0, out.buffer.size_bytes); for (size_t i = 0; i < b.size(); ++i) { ArrowBitSetTo(out.buffer.data, i, static_cast(b[i])); @@ -296,6 +296,7 @@ std::enable_if_t, nanoarrow::UniqueArray> g { nanoarrow::UniqueArray tmp; NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(tmp.get()), mask.size())); NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(tmp.get())); NANOARROW_THROW_NOT_OK(ArrowArrayReserve(tmp.get(), data.size())); @@ -378,3 +379,5 @@ get_nanoarrow_cudf_table(cudf::size_type length); std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> get_nanoarrow_host_tables(cudf::size_type length); + +void slice_host_nanoarrow(ArrowArray* arr, int64_t start, int64_t end); diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp index 8903f09b82b..77da4039103 100644 --- a/cpp/tests/interop/to_arrow_device_test.cpp +++ b/cpp/tests/interop/to_arrow_device_test.cpp @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/interop/to_arrow_host_test.cpp b/cpp/tests/interop/to_arrow_host_test.cpp new file mode 100644 index 00000000000..fc0ed6c9352 --- /dev/null +++ b/cpp/tests/interop/to_arrow_host_test.cpp @@ -0,0 +1,1117 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "nanoarrow_utils.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +using vector_of_columns = std::vector>; + +struct BaseToArrowHostFixture : public cudf::test::BaseFixture { + template + std::enable_if_t() and !std::is_same_v, void> compare_subset( + ArrowArrayView const* expected, + int64_t start_offset_expected, + ArrowArrayView const* actual, + int64_t start_offset_actual, + int64_t length) + { + for (int64_t i = 0; i < length; ++i) { + const bool is_null = ArrowArrayViewIsNull(expected, start_offset_expected + i); + EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, start_offset_actual + i)); + if (is_null) continue; + + const auto expected_val = ArrowArrayViewGetIntUnsafe(expected, start_offset_expected + i); + const auto actual_val = ArrowArrayViewGetIntUnsafe(actual, start_offset_actual + i); + + EXPECT_EQ(expected_val, actual_val); + } + } + + template + std::enable_if_t, void> compare_subset( + ArrowArrayView const* expected, + int64_t start_offset_expected, + ArrowArrayView const* actual, + int64_t start_offset_actual, + int64_t length) + { + for (int64_t i = 0; i < length; ++i) { + const bool is_null = ArrowArrayViewIsNull(expected, start_offset_expected + i); + EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, start_offset_actual + i)); + if (is_null) continue; + + const auto expected_view = ArrowArrayViewGetBytesUnsafe(expected, start_offset_expected + i); + const auto actual_view = ArrowArrayViewGetBytesUnsafe(actual, start_offset_actual + i); + + EXPECT_EQ(expected_view.size_bytes, actual_view.size_bytes); + EXPECT_TRUE( + 0 == std::memcmp(expected_view.data.data, actual_view.data.data, expected_view.size_bytes)); + } + } + + void compare_child_subset(ArrowArrayView const* expected, + int64_t exp_start_offset, + ArrowArrayView const* actual, + int64_t act_start_offset, + int64_t length) + { + EXPECT_EQ(expected->storage_type, actual->storage_type); + EXPECT_EQ(expected->n_children, actual->n_children); + + switch (expected->storage_type) { + case NANOARROW_TYPE_LIST: + for (int64_t i = 0; i < length; ++i) { + const auto expected_start = exp_start_offset + i; + const auto actual_start = act_start_offset + i; + + // ArrowArrayViewIsNull accounts for the array offset, so we can properly + // compare the validity of indexes + const bool is_null = ArrowArrayViewIsNull(expected, expected_start); + EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, actual_start)); + if (is_null) continue; + + // ArrowArrayViewListChildOffset does not account for array offset, so we need + // to add the offset to the index in order to get the correct offset into the list + const int64_t start_offset_expected = + ArrowArrayViewListChildOffset(expected, expected->offset + expected_start); + const int64_t start_offset_actual = + ArrowArrayViewListChildOffset(actual, actual->offset + actual_start); + + const int64_t end_offset_expected = + ArrowArrayViewListChildOffset(expected, expected->offset + expected_start + 1); + const int64_t end_offset_actual = + ArrowArrayViewListChildOffset(actual, actual->offset + actual_start + 1); + + // verify the list lengths are the same + EXPECT_EQ(end_offset_expected - start_offset_expected, + end_offset_actual - start_offset_actual); + // compare the list values + compare_child_subset(expected->children[0], + start_offset_expected, + actual->children[0], + start_offset_actual, + 
end_offset_expected - start_offset_expected); + } + break; + case NANOARROW_TYPE_STRUCT: + for (int64_t i = 0; i < length; ++i) { + SCOPED_TRACE("idx: " + std::to_string(i)); + const auto expected_start = exp_start_offset + i; + const auto actual_start = act_start_offset + i; + + const bool is_null = ArrowArrayViewIsNull(expected, expected_start); + EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, actual_start)); + if (is_null) continue; + + for (int64_t child = 0; child < expected->n_children; ++child) { + SCOPED_TRACE("child: " + std::to_string(child)); + compare_child_subset(expected->children[child], + expected_start + expected->offset, + actual->children[child], + actual_start + actual->offset, + 1); + } + } + break; + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: + case NANOARROW_TYPE_BINARY: + case NANOARROW_TYPE_LARGE_BINARY: + compare_subset( + expected, exp_start_offset, actual, act_start_offset, length); + break; + default: + compare_subset(expected, exp_start_offset, actual, act_start_offset, length); + break; + } + } + + void compare_arrays(ArrowArrayView const* expected, ArrowArrayView const* actual) + { + EXPECT_EQ(expected->length, actual->length); + EXPECT_EQ(expected->null_count, actual->null_count); + EXPECT_EQ(expected->offset, actual->offset); + EXPECT_EQ(expected->n_children, actual->n_children); + EXPECT_EQ(expected->storage_type, actual->storage_type); + + // cudf automatically pushes down nulls and purges non-empty, non-zero nulls + // from the children columns. So while we can memcmp the buffers for top + // level arrays, we need to do an "equivalence" comparison for nested + // arrays (lists and structs) by checking each index for null and skipping + // comparisons for children if null. + switch (expected->storage_type) { + case NANOARROW_TYPE_STRUCT: + // if we're a struct with no children, then we just skip + // attempting to compare the children + if (expected->n_children == 0) { + EXPECT_EQ(nullptr, actual->children); + break; + } + // otherwise we can fallthrough and do the same thing we do for lists + case NANOARROW_TYPE_LIST: + compare_child_subset(expected, 0, actual, 0, expected->length); + break; + default: + for (int64_t i = 0; i < actual->array->n_buffers; ++i) { + SCOPED_TRACE("buffer " + std::to_string(i)); + auto expected_buf = expected->buffer_views[i]; + auto actual_buf = actual->buffer_views[i]; + + EXPECT_TRUE(0 == std::memcmp(expected_buf.data.data, + actual_buf.data.data, + expected_buf.size_bytes)); + } + } + + if (expected->dictionary != nullptr) { + EXPECT_NE(nullptr, actual->dictionary); + SCOPED_TRACE("dictionary"); + compare_arrays(expected->dictionary, actual->dictionary); + } else { + EXPECT_EQ(nullptr, actual->dictionary); + } + } +}; + +struct ToArrowHostDeviceTest : public BaseToArrowHostFixture {}; +template +struct ToArrowHostDeviceTestDurationsTest : public BaseToArrowHostFixture {}; + +TYPED_TEST_SUITE(ToArrowHostDeviceTestDurationsTest, cudf::test::DurationTypes); + +TEST_F(ToArrowHostDeviceTest, EmptyTable) +{ + auto [tbl, schema, arr] = get_nanoarrow_host_tables(0); + + auto got_arrow_host = cudf::to_arrow_host(tbl->view()); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, arr.get(), nullptr)); + + 
NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + + ArrowArrayViewReset(&expected); + ArrowArrayViewReset(&actual); +} + +TEST_F(ToArrowHostDeviceTest, DateTimeTable) +{ + auto data = std::initializer_list{1, 2, 3, 4, 5, 6}; + auto col = + cudf::test::fixed_width_column_wrapper(data); + cudf::table_view input_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime( + expected_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + auto got_arrow_host = cudf::to_arrow_host(input_view); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + expected.length = data.size(); + expected.children[0]->length = data.size(); + ArrowArrayViewSetLength(expected.children[0], data.size()); + expected.children[0]->buffer_views[0].data.data = nullptr; + expected.children[0]->buffer_views[0].size_bytes = 0; + expected.children[0]->buffer_views[1].data.data = data.begin(); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input_view.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + BaseToArrowHostFixture::compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + ArrowArrayViewReset(&actual); +} + +TYPED_TEST(ToArrowHostDeviceTestDurationsTest, DurationTable) +{ + using T = TypeParam; + + if (cudf::type_to_id() == cudf::type_id::DURATION_DAYS) { return; } + + auto data = {T{1}, T{2}, T{3}, T{4}, T{5}, T{6}}; + auto col = cudf::test::fixed_width_column_wrapper(data); + + cudf::table_view input_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + + ArrowSchemaInit(expected_schema->children[0]); + const ArrowTimeUnit arrow_unit = [&] { + switch (cudf::type_to_id()) { + case cudf::type_id::DURATION_SECONDS: return NANOARROW_TIME_UNIT_SECOND; + case cudf::type_id::DURATION_MILLISECONDS: return NANOARROW_TIME_UNIT_MILLI; + case cudf::type_id::DURATION_MICROSECONDS: return NANOARROW_TIME_UNIT_MICRO; + case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TIME_UNIT_NANO; + default: CUDF_FAIL("Unsupported duration unit in arrow"); + } + }(); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime( + expected_schema->children[0], NANOARROW_TYPE_DURATION, arrow_unit, nullptr)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + 
expected_schema->children[0]->flags = 0; + + auto got_arrow_host = cudf::to_arrow_host(input_view); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + + expected.length = data.size(); + expected.children[0]->length = data.size(); + ArrowArrayViewSetLength(expected.children[0], data.size()); + expected.children[0]->buffer_views[0].data.data = nullptr; + expected.children[0]->buffer_views[0].size_bytes = 0; + expected.children[0]->buffer_views[1].data.data = data.begin(); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + BaseToArrowHostFixture::compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input_view.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + BaseToArrowHostFixture::compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); +} + +TEST_F(ToArrowHostDeviceTest, NestedList) +{ + auto valids = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; }); + auto col = cudf::test::lists_column_wrapper( + {{{{{1, 2}, valids}, {{3, 4}, valids}, {5}}, {{6}, {{7, 8, 9}, valids}}}, valids}); + cudf::table_view input_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(expected_schema->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = ARROW_FLAG_NULLABLE; + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(expected_schema->children[0]->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0]->children[0], "element")); + expected_schema->children[0]->children[0]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType( + expected_schema->children[0]->children[0]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaSetName(expected_schema->children[0]->children[0]->children[0], "element")); + expected_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE; + + auto got_arrow_host = cudf::to_arrow_host(input_view); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + auto list_arr = get_nanoarrow_list_array({6, 7, 8, 9}, {0, 1, 4}, {1, 0, 1, 1}); + std::vector offset{0, 0, 2}; + + ArrowBitmap mask; + ArrowBitmapInit(&mask); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 2)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 1)); + + nanoarrow::UniqueArray expected_arr; + EXPECT_EQ(NANOARROW_OK, + ArrowArrayInitFromSchema(expected_arr.get(), expected_schema.get(), nullptr)); + expected_arr->length = input_view.num_rows(); + expected_arr->null_count = 0; + + 
ArrowArraySetValidityBitmap(expected_arr->children[0], &mask); + expected_arr->children[0]->length = input_view.num_rows(); + expected_arr->children[0]->null_count = 1; + auto offset_buf = ArrowArrayBuffer(expected_arr->children[0], 1); + EXPECT_EQ( + NANOARROW_OK, + ArrowBufferAppend( + offset_buf, reinterpret_cast(offset.data()), offset.size() * sizeof(int32_t))); + list_arr.move(expected_arr->children[0]->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_arr.get(), nullptr)); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_arr.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input_view.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); +} + +TEST_F(ToArrowHostDeviceTest, StructColumn) +{ + // Create cudf table + auto nested_type_field_names = + std::vector>{{"string", "integral", "bool", "nested_list", "struct"}}; + auto str_col = + cudf::test::strings_column_wrapper{ + "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"} + .release(); + auto str_col2 = + cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {0, 1, 0}}.release(); + int num_rows{str_col->size()}; + auto int_col = cudf::test::fixed_width_column_wrapper{{48, 27, 25}}.release(); + auto int_col2 = + cudf::test::fixed_width_column_wrapper{{12, 24, 47}, {1, 0, 1}}.release(); + auto bool_col = cudf::test::fixed_width_column_wrapper{{true, true, false}}.release(); + auto list_col = + cudf::test::lists_column_wrapper({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}}) + .release(); + vector_of_columns cols2; + cols2.push_back(std::move(str_col2)); + cols2.push_back(std::move(int_col2)); + auto [null_mask, null_count] = + cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper{{true, true, false}}); + auto sub_struct_col = + cudf::make_structs_column(num_rows, std::move(cols2), null_count, std::move(*null_mask)); + vector_of_columns cols; + cols.push_back(std::move(str_col)); + cols.push_back(std::move(int_col)); + cols.push_back(std::move(bool_col)); + cols.push_back(std::move(list_col)); + cols.push_back(std::move(sub_struct_col)); + + auto struct_col = cudf::make_structs_column(num_rows, std::move(cols), 0, {}); + cudf::table_view input_view({struct_col->view()}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema->children[0], 5)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + auto child = expected_schema->children[0]; + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[0], 
"string")); + child->children[0]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[1], "integral")); + child->children[1]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[2], "bool")); + child->children[2]->flags = 0; + + NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3], "nested_list")); + child->children[3]->flags = 0; + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3]->children[0], "element")); + child->children[3]->children[0]->flags = 0; + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64)); + NANOARROW_THROW_NOT_OK( + ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element")); + child->children[3]->children[0]->children[0]->flags = 0; + + ArrowSchemaInit(child->children[4]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(child->children[4], 2)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4], "struct")); + + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[0], "string2")); + NANOARROW_THROW_NOT_OK( + ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[1], "integral2")); + + // create nanoarrow table + // first our underlying arrays + std::vector str{"Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"}; + std::vector str2{"CUDF", "ROCKS", "EVERYWHERE"}; + auto str_array = get_nanoarrow_array(str); + auto int_array = get_nanoarrow_array({48, 27, 25}); + auto str2_array = get_nanoarrow_array(str2, {0, 1, 0}); + // struct null will get pushed down and superimposed on this array + auto int2_array = get_nanoarrow_array({12, 24, 47}, {1, 0, 0}); + auto bool_array = get_nanoarrow_array({true, true, false}); + auto list_arr = + get_nanoarrow_list_array({1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 2, 4, 5, 6, 7, 9}); + std::vector offset{0, 3, 4, 6}; + + nanoarrow::UniqueArray expected_arr; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_arr.get(), expected_schema.get(), nullptr)); + expected_arr->length = input_view.num_rows(); + + auto array_a = expected_arr->children[0]; + auto view_a = input_view.column(0); + array_a->length = view_a.size(); + array_a->null_count = view_a.null_count(); + + str_array.move(array_a->children[0]); + int_array.move(array_a->children[1]); + bool_array.move(array_a->children[2]); + + array_a->children[3]->length = input_view.num_rows(); + array_a->children[3]->null_count = 0; + + auto offset_buf = ArrowArrayBuffer(array_a->children[3], 1); + EXPECT_EQ( + NANOARROW_OK, + ArrowBufferAppend( + offset_buf, reinterpret_cast(offset.data()), offset.size() * sizeof(int32_t))); + list_arr.move(array_a->children[3]->children[0]); + + ArrowBitmap mask; + ArrowBitmapInit(&mask); + NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 3)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 2)); + NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1)); + + auto array_struct = 
array_a->children[4]; + auto view_struct = view_a.child(4); + ArrowArraySetValidityBitmap(array_struct, &mask); + array_struct->null_count = view_struct.null_count(); + array_struct->length = view_struct.size(); + + str2_array.move(array_struct->children[0]); + int2_array.move(array_struct->children[1]); + + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_arr.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input_view); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_arr.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input_view.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); +} + +template +using fp_wrapper = cudf::test::fixed_point_column_wrapper; + +TEST_F(ToArrowHostDeviceTest, FixedPoint32Table) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const col = fp_wrapper({-1, 2, 3, 4, 5, 6}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto const data = std::vector<__int128_t>{-1, 2, 3, 4, 5, 6}; + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, 
expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint64Table) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const col = fp_wrapper({-1, 2, 3, 4, 5, 6}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto const data = std::vector<__int128_t>{-1, 2, 3, 4, 5, 6}; + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint128Table) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const col = fp_wrapper<__int128_t>({-1, 2, 3, 4, 5, 6}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto const data = std::vector<__int128_t>{-1, 2, 3, 4, 5, 6}; + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision<__int128_t>(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + 
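The expected buffer is built from __int128_t even though the column under test stores decimal64 values: to_arrow_host widens decimal32/decimal64 to Arrow's decimal128 by keeping each value in the low-order words and filling the remaining words with the sign, as in this assumed sketch (not part of the patch):

    // int64_t    in  = -5;   // 0xFFFFFFFFFFFFFFFB
    // __int128_t out = in;   // low 64 bits:  0xFFFFFFFFFFFFFFFB
    //                        // high 64 bits: 0xFFFFFFFFFFFFFFFF (sign extension)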
get_nanoarrow_array<__int128_t>(data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint32TableLarge) +{ + using namespace numeric; + auto constexpr NUM_ELEMENTS = 1000; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto expect_data = std::vector<__int128_t>(NUM_ELEMENTS); + std::iota(expect_data.begin(), expect_data.end(), 1); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(expect_data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + 
compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint64TableLarge) +{ + using namespace numeric; + auto constexpr NUM_ELEMENTS = 1000; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper(iota, iota + NUM_ELEMENTS, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto expect_data = std::vector<__int128_t>(NUM_ELEMENTS); + std::iota(expect_data.begin(), expect_data.end(), 1); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(expect_data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint128TableLarge) +{ + using namespace numeric; + auto constexpr NUM_ELEMENTS = 1000; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const iota = thrust::make_counting_iterator(1); + auto const col = fp_wrapper<__int128_t>(iota, iota + NUM_ELEMENTS, scale_type{scale}); + auto const input = cudf::table_view({col}); + + auto expect_data = std::vector<__int128_t>(NUM_ELEMENTS); + std::iota(expect_data.begin(), expect_data.end(), 1); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision<__int128_t>(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + 
nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(expect_data).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint32TableNullsSimple) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const data = std::vector<__int128_t>{1, 2, 3, 4, 5, 6, 0, 0}; + auto const validity = std::vector{1, 1, 1, 1, 1, 1, 0, 0}; + auto const col = + fp_wrapper({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(data, validity).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + 
ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint64TableNullsSimple) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const data = std::vector<__int128_t>{1, 2, 3, 4, 5, 6, 0, 0}; + auto const validity = std::vector{1, 1, 1, 1, 1, 1, 0, 0}; + auto const col = + fp_wrapper({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision(), + -scale)); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(data, validity).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +TEST_F(ToArrowHostDeviceTest, FixedPoint128TableNullsSimple) +{ + using namespace numeric; + + for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) { + auto const data = std::vector<__int128_t>{1, 2, 3, 4, 5, 6, 0, 0}; + auto const validity = std::vector{1, 1, 1, 1, 1, 1, 0, 0}; + auto const col = + fp_wrapper<__int128_t>({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{scale}); + auto const input = cudf::table_view({col}); + + nanoarrow::UniqueSchema expected_schema; + ArrowSchemaInit(expected_schema.get()); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1)); + ArrowSchemaInit(expected_schema->children[0]); + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0], + NANOARROW_TYPE_DECIMAL128, + cudf::detail::max_precision<__int128_t>(), + -scale)); + 
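The negated scale above reflects the two libraries' differing sign conventions: cudf's fixed_point stores value = significand * 10^scale, while an Arrow decimal stores value = unscaled / 10^scale, so a cudf scale of s maps to an Arrow scale of -s. A small assumed example (not from the patch):

    // cudf:  decimal128 significand 12345, scale -2  ->  123.45
    // arrow: decimal128 unscaled    12345, scale  2  ->  123.45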
NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a")); + expected_schema->children[0]->flags = 0; + + nanoarrow::UniqueArray expected_array; + NANOARROW_THROW_NOT_OK( + ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr)); + expected_array->length = input.num_rows(); + + get_nanoarrow_array<__int128_t>(data, validity).move(expected_array->children[0]); + NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr)); + + auto got_arrow_host = cudf::to_arrow_host(input); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + got_arrow_host = cudf::to_arrow_host(input.column(0)); + NANOARROW_THROW_NOT_OK( + ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(expected.children[0], &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); + } +} + +struct ToArrowHostDeviceTestSlice + : public ToArrowHostDeviceTest, + public ::testing::WithParamInterface> {}; + +TEST_P(ToArrowHostDeviceTestSlice, SliceTest) +{ + auto [table, expected_schema, expected_array] = get_nanoarrow_host_tables(10000); + auto cudf_table_view = table->view(); + auto const [start, end] = GetParam(); + + slice_host_nanoarrow(expected_array.get(), start, end); + auto sliced_cudf_table = cudf::slice(cudf_table_view, {start, end})[0]; + auto got_arrow_host = cudf::to_arrow_host(sliced_cudf_table); + EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type); + EXPECT_EQ(-1, got_arrow_host->device_id); + EXPECT_EQ(nullptr, got_arrow_host->sync_event); + + ArrowArrayView expected, actual; + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr)); + + NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr)); + NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr)); + compare_arrays(&expected, &actual); + ArrowArrayViewReset(&actual); + + ArrowArrayViewReset(&expected); +} + +INSTANTIATE_TEST_CASE_P(ToArrowHostDeviceTest, + ToArrowHostDeviceTestSlice, + ::testing::Values(std::make_tuple(0, 10000), + std::make_tuple(100, 3000), + std::make_tuple(0, 0), + std::make_tuple(0, 3000))); From 743264f6ac924fdbec58fad666f989b14b901a98 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Wed, 24 Jul 2024 05:32:31 -0500 Subject: [PATCH 11/15] Warn on cuDF failure when `POLARS_VERBOSE` is true (#16308) Just something quick to get us started here Closes https://github.com/rapidsai/cudf/issues/16256 Authors: - https://github.com/brandon-b-miller - Lawrence Mitchell (https://github.com/wence-) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: 
https://github.com/rapidsai/cudf/pull/16308 --- python/cudf_polars/cudf_polars/callback.py | 12 +++++++- python/cudf_polars/tests/test_config.py | 34 ++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 python/cudf_polars/tests/test_config.py diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index 764cdd3b3ca..f31193aa938 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -5,11 +5,15 @@ from __future__ import annotations +import os +import warnings from functools import partial from typing import TYPE_CHECKING import nvtx +from polars.exceptions import PerformanceWarning + from cudf_polars.dsl.translate import translate_ir if TYPE_CHECKING: @@ -61,6 +65,12 @@ def execute_with_cudf( try: with nvtx.annotate(message="ConvertIR", domain="cudf_polars"): nt.set_udf(partial(_callback, translate_ir(nt))) - except exception: + except exception as e: + if bool(int(os.environ.get("POLARS_VERBOSE", 0))): + warnings.warn( + f"Query execution with GPU not supported, reason: {type(e)}: {e}", + PerformanceWarning, + stacklevel=2, + ) if raise_on_fail: raise diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py new file mode 100644 index 00000000000..5b4bba55552 --- /dev/null +++ b/python/cudf_polars/tests/test_config.py @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.dsl.ir import IR +from cudf_polars.testing.asserts import ( + assert_gpu_result_equal, + assert_ir_translation_raises, +) + + +def test_polars_verbose_warns(monkeypatch): + def raise_unimplemented(self): + raise NotImplementedError("We don't support this") + + monkeypatch.setattr(IR, "__post_init__", raise_unimplemented) + q = pl.LazyFrame({}) + # Ensure that things raise + assert_ir_translation_raises(q, NotImplementedError) + with ( + pl.Config(verbose=True), + pytest.raises(pl.exceptions.ComputeError), + pytest.warns( + pl.exceptions.PerformanceWarning, + match="Query execution with GPU not supported", + ), + ): + # And ensure that collecting issues the correct warning. 
+ assert_gpu_result_equal(q) From 7191b74ce244518f17ef65e701f5a262f1c5cf8a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 24 Jul 2024 03:55:48 -1000 Subject: [PATCH 12/15] Align Index __init__ APIs with pandas 2.x (#16362) * It would be nice to have `Index`'s constructor to not go through `IndexMeta.__call__`, but I think that would be a separate effort * There were a couple `verify_integrity` keyword arguments added that don't raise a `NotImplementedError` since there's not support, but I don't think it's worth making this case falling back in `cudf.pandas` as it's just a validation and won't affect further behavior with the object Authors: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/16362 --- docs/cudf/source/conf.py | 1 + python/cudf/cudf/core/index.py | 48 ++++++++++++++++++++++------- python/cudf/cudf/core/multiindex.py | 2 +- 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c3c14ac8cad..f544536fb31 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -556,6 +556,7 @@ def on_missing_reference(app, env, node, contnode): ("py:class", "Dtype"), # The following are erroneously warned due to # https://github.com/sphinx-doc/sphinx/issues/11225 + ("py:obj", "cudf.Index.values_host"), ("py:class", "pa.Array"), ("py:class", "ScalarLike"), ("py:class", "ParentType"), diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 73b7298410a..1c48b8f4f2d 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -78,6 +78,11 @@ class IndexMeta(type): """Custom metaclass for Index that overrides instance/subclass tests.""" def __call__(cls, data, *args, **kwargs): + if kwargs.get("tupleize_cols", True) is not True: + raise NotImplementedError( + "tupleize_cols is currently not supported." + ) + if cls is Index: return as_index( arbitrary=data, @@ -997,21 +1002,23 @@ def __dask_tokenize__(self): class Index(SingleColumnFrame, BaseIndex, metaclass=IndexMeta): """ - An array of orderable values that represent the indices of another Column + Immutable sequence used for indexing and alignment. - Attributes - ---------- - _values: A Column object - name: A string + The basic object storing axis labels for all pandas objects. Parameters ---------- - data : Column - The Column of data for this index - name : str optional - The name of the Index. If not provided, the Index adopts the value - Column's name. Otherwise if this name is different from the value - Column's, the data Column will be cloned to adopt this name. + data : array-like (1-dimensional) + dtype : str, numpy.dtype, or ExtensionDtype, optional + Data type for the output Index. If not specified, this will be + inferred from `data`. + copy : bool, default False + Copy input data. + name : object + Name to be stored in the index. + tupleize_cols : bool (default: True) + When True, attempt to create a MultiIndex if possible. + Currently not supported. """ @_performance_tracking @@ -1735,8 +1742,18 @@ def __init__( if tz is not None: raise NotImplementedError("tz is not yet supported") if normalize is not False: + warnings.warn( + "The 'normalize' keyword is " + "deprecated and will be removed in a future version. 
", + FutureWarning, + ) raise NotImplementedError("normalize == True is not yet supported") if closed is not None: + warnings.warn( + "The 'closed' keyword is " + "deprecated and will be removed in a future version. ", + FutureWarning, + ) raise NotImplementedError("closed is not yet supported") if ambiguous != "raise": raise NotImplementedError("ambiguous is not yet supported") @@ -2480,6 +2497,14 @@ def __init__( if freq is not None: raise NotImplementedError("freq is not yet supported") + if closed is not None: + warnings.warn( + "The 'closed' keyword is " + "deprecated and will be removed in a future version. ", + FutureWarning, + ) + raise NotImplementedError("closed is not yet supported") + if unit is not None: warnings.warn( "The 'unit' keyword is " @@ -2863,6 +2888,7 @@ def __init__( dtype=None, copy: bool = False, name=None, + verify_integrity: bool = True, ): name = _getdefault_name(data, name=name) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index ff4b06c6334..dfc596bf279 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -150,7 +150,7 @@ def __init__( dtype=None, copy=False, name=None, - **kwargs, + verify_integrity=True, ): if sortorder is not None: raise NotImplementedError("sortorder is not yet supported") From 8fcf72a787acb0168c97d11b8ab9130146e9b37e Mon Sep 17 00:00:00 2001 From: Alessandro Bellina Date: Wed, 24 Jul 2024 12:06:29 -0500 Subject: [PATCH 13/15] [JNI] Add setKernelPinnedCopyThreshold and setPinnedAllocationThreshold (#16288) In 24.08 two new cuDF methods are being added, and the second method is still in flight (see: https://github.com/rapidsai/cudf/pull/16206): ``` cudf::set_kernel_pinned_copy_threshold cudf::set_allocate_host_as_pinned_threshold ``` We'd like to expose these methods in our JNI layer. I created a Cudf.java with the two static methods, and put the definitions in CudfJni.cpp. Marked as draft since I need https://github.com/rapidsai/cudf/pull/16206 to merge, and we are still testing it. Authors: - Alessandro Bellina (https://github.com/abellina) - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/16288 --- java/src/main/java/ai/rapids/cudf/Cudf.java | 36 +++++++++++++++++++++ java/src/main/native/src/CudfJni.cpp | 25 ++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 java/src/main/java/ai/rapids/cudf/Cudf.java diff --git a/java/src/main/java/ai/rapids/cudf/Cudf.java b/java/src/main/java/ai/rapids/cudf/Cudf.java new file mode 100644 index 00000000000..d09e2f87ed4 --- /dev/null +++ b/java/src/main/java/ai/rapids/cudf/Cudf.java @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package ai.rapids.cudf; + +public class Cudf { + + static { + NativeDepsLoader.loadNativeDeps(); + } + + /** + * cuDF copies that are smaller than the threshold will use a kernel to copy, instead + * of cudaMemcpyAsync. + */ + public static native void setKernelPinnedCopyThreshold(long kernelPinnedCopyThreshold); + + /** + * cudf allocations that are smaller than the threshold will use the pinned host + * memory resource. + */ + public static native void setPinnedAllocationThreshold(long pinnedAllocationThreshold); +} diff --git a/java/src/main/native/src/CudfJni.cpp b/java/src/main/native/src/CudfJni.cpp index 698a8f6ff02..2860dc2e4b2 100644 --- a/java/src/main/native/src/CudfJni.cpp +++ b/java/src/main/native/src/CudfJni.cpp @@ -18,6 +18,7 @@ #include #include +#include #include @@ -201,4 +202,28 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_isPtdsEnabled(JNIEnv* env, j return cudf::jni::is_ptds_enabled; } +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cudf_setKernelPinnedCopyThreshold(JNIEnv* env, + jclass clazz, + jlong jthreshold) +{ + try { + cudf::jni::auto_set_device(env); + auto threshold = static_cast(jthreshold); + cudf::set_kernel_pinned_copy_threshold(threshold); + } + CATCH_STD(env, ) +} + +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cudf_setPinnedAllocationThreshold(JNIEnv* env, + jclass clazz, + jlong jthreshold) +{ + try { + cudf::jni::auto_set_device(env); + auto threshold = static_cast(jthreshold); + cudf::set_allocate_host_as_pinned_threshold(threshold); + } + CATCH_STD(env, ) +} + } // extern "C" From 73937fbabaeea76665663ed23688b1cac61b7ee9 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 24 Jul 2024 16:42:00 -0400 Subject: [PATCH 14/15] Migrate lists/filling to pylibcudf (#16189) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Thomas Li (https://github.com/lithomas1) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/16189 --- .../_lib/pylibcudf/libcudf/lists/filling.pxd | 19 ++++++++++ python/cudf/cudf/_lib/pylibcudf/lists.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/lists.pyx | 38 +++++++++++++++++++ .../cudf/cudf/pylibcudf_tests/test_lists.py | 16 ++++++++ 4 files changed, 75 insertions(+) create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd new file mode 100644 index 00000000000..8403fd179f7 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd @@ -0,0 +1,19 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view + + +cdef extern from "cudf/lists/filling.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] sequences( + const column_view& starts, + const column_view& sizes, + ) except + + + cdef unique_ptr[column] sequences( + const column_view& starts, + const column_view& steps, + const column_view& sizes, + ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index cacecae6010..6e9bd5ff76b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -36,4 +36,6 @@ cpdef Column extract_list_element(Column, ColumnOrSizeType) cpdef Column count_elements(Column) +cpdef Column sequences(Column, Column, Column steps = *) + cpdef Column sort_lists(Column, bool, null_order, bool stable = *) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index b5661a3e634..3837eaaca78 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.lists cimport ( contains as cpp_contains, explode as cpp_explode, + filling as cpp_filling, gather as cpp_gather, reverse as cpp_reverse, ) @@ -326,6 +327,43 @@ cpdef Column count_elements(Column input): return Column.from_libcudf(move(c_result)) +cpdef Column sequences(Column starts, Column sizes, Column steps = None): + """Create a lists column in which each row contains a sequence of + values specified by a tuple of (start, step, size) parameters. + + For details, see :cpp:func:`sequences`. + + Parameters + ---------- + starts : Column + First values in the result sequences. + sizes : Column + Numbers of values in the result sequences. + steps : Optional[Column] + Increment values for the result sequences. + + Returns + ------- + Column + The result column containing generated sequences. 
+ """ + cdef unique_ptr[column] c_result + + if steps is not None: + with nogil: + c_result = move(cpp_filling.sequences( + starts.view(), + steps.view(), + sizes.view(), + )) + else: + with nogil: + c_result = move(cpp_filling.sequences( + starts.view(), + sizes.view(), + )) + return Column.from_libcudf(move(c_result)) + cpdef Column sort_lists( Column input, bool ascending, diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index 87472f6d59b..0b2e0e00ce8 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -198,6 +198,22 @@ def test_count_elements(test_data): assert_column_eq(expect, res) +def test_sequences(): + starts = plc.interop.from_arrow(pa.array([0, 1, 2, 3, 4])) + steps = plc.interop.from_arrow(pa.array([2, 1, 1, 1, -3])) + sizes = plc.interop.from_arrow(pa.array([0, 2, 2, 1, 3])) + + res1 = plc.lists.sequences(starts, sizes, steps) + res2 = plc.lists.sequences(starts, sizes) + + expect1 = pa.array([[], [1, 2], [2, 3], [3], [4, 1, -2]]) + expect2 = pa.array([[], [1, 2], [2, 3], [3], [4, 5, 6]]) + + assert_column_eq(expect1, res1) + + assert_column_eq(expect2, res2) + + @pytest.mark.parametrize( "ascending,na_position,expected", [ From 8bba6dfad239b4fd69a82acbc5dd7707ba576cce Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 24 Jul 2024 18:16:03 -0400 Subject: [PATCH 15/15] Migrate lists/set_operations to pylibcudf (#16190) Apart of #15162 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Thomas Li (https://github.com/lithomas1) URL: https://github.com/rapidsai/cudf/pull/16190 --- .../libcudf/lists/set_operations.pxd | 39 ++++ python/cudf/cudf/_lib/pylibcudf/lists.pxd | 8 + python/cudf/cudf/_lib/pylibcudf/lists.pyx | 203 +++++++++++++++++- .../cudf/cudf/pylibcudf_tests/test_lists.py | 90 ++++++++ 4 files changed, 339 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd new file mode 100644 index 00000000000..eb796897f87 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd @@ -0,0 +1,39 @@ +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+ +from libcpp.memory cimport unique_ptr + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport ( + lists_column_view, +) +from cudf._lib.pylibcudf.libcudf.types cimport nan_equality, null_equality + + +cdef extern from "cudf/lists/set_operations.hpp" namespace "cudf::lists" nogil: + cdef unique_ptr[column] difference_distinct( + const lists_column_view& lhs, + const lists_column_view& rhs, + null_equality nulls_equal, + nan_equality nans_equal + ) except + + + cdef unique_ptr[column] have_overlap( + const lists_column_view& lhs, + const lists_column_view& rhs, + null_equality nulls_equal, + nan_equality nans_equal + ) except + + + cdef unique_ptr[column] intersect_distinct( + const lists_column_view& lhs, + const lists_column_view& rhs, + null_equality nulls_equal, + nan_equality nans_equal + ) except + + + cdef unique_ptr[column] union_distinct( + const lists_column_view& lhs, + const lists_column_view& rhs, + null_equality nulls_equal, + nan_equality nans_equal + ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd index 6e9bd5ff76b..4e2406c2aea 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd @@ -39,3 +39,11 @@ cpdef Column count_elements(Column) cpdef Column sequences(Column, Column, Column steps = *) cpdef Column sort_lists(Column, bool, null_order, bool stable = *) + +cpdef Column difference_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) + +cpdef Column have_overlap(Column, Column, bool nulls_equal=*, bool nans_equal=*) + +cpdef Column intersect_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) + +cpdef Column union_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*) diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx index 3837eaaca78..7555c8c6970 100644 --- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx @@ -12,6 +12,7 @@ from cudf._lib.pylibcudf.libcudf.lists cimport ( filling as cpp_filling, gather as cpp_gather, reverse as cpp_reverse, + set_operations as cpp_set_operations, ) from cudf._lib.pylibcudf.libcudf.lists.combine cimport ( concatenate_list_elements as cpp_concatenate_list_elements, @@ -29,7 +30,13 @@ from cudf._lib.pylibcudf.libcudf.lists.sorting cimport ( stable_sort_lists as cpp_stable_sort_lists, ) from cudf._lib.pylibcudf.libcudf.table.table cimport table -from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, size_type +from cudf._lib.pylibcudf.libcudf.types cimport ( + nan_equality, + null_equality, + null_order, + order, + size_type, +) from cudf._lib.pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType from .column cimport Column, ListColumnView @@ -413,3 +420,197 @@ cpdef Column sort_lists( na_position, )) return Column.from_libcudf(move(c_result)) + + +cpdef Column difference_distinct( + Column lhs, + Column rhs, + bool nulls_equal=True, + bool nans_equal=True +): + """Create a column of index values indicating the position of a search + key row within the corresponding list row in the lists column. + + For details, see :cpp:func:`difference_distinct`. + + Parameters + ---------- + lhs : Column + The input lists column of elements that may be included. + rhs : Column + The input lists column of elements to exclude. + nulls_equal : bool, default True + If true, null elements are considered equal. Otherwise, unequal. 
+ nans_equal : bool, default True + If true, libcudf will treat nan elements from {-nan, +nan} + as equal. Otherwise, unequal. Otherwise, unequal. + + Returns + ------- + Column + A lists column containing the difference results. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView lhs_view = lhs.list_view() + cdef ListColumnView rhs_view = rhs.list_view() + + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL + ) + + with nogil: + c_result = move(cpp_set_operations.difference_distinct( + lhs_view.view(), + rhs_view.view(), + c_nulls_equal, + c_nans_equal, + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column have_overlap( + Column lhs, + Column rhs, + bool nulls_equal=True, + bool nans_equal=True +): + """Check if lists at each row of the given lists columns overlap. + + For details, see :cpp:func:`have_overlap`. + + Parameters + ---------- + lhs : Column + The input lists column for one side. + rhs : Column + The input lists column for the other side. + nulls_equal : bool, default True + If true, null elements are considered equal. Otherwise, unequal. + nans_equal : bool, default True + If true, libcudf will treat nan elements from {-nan, +nan} + as equal. Otherwise, unequal. Otherwise, unequal. + + Returns + ------- + Column + A column containing the check results. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView lhs_view = lhs.list_view() + cdef ListColumnView rhs_view = rhs.list_view() + + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL + ) + + with nogil: + c_result = move(cpp_set_operations.have_overlap( + lhs_view.view(), + rhs_view.view(), + c_nulls_equal, + c_nans_equal, + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column intersect_distinct( + Column lhs, + Column rhs, + bool nulls_equal=True, + bool nans_equal=True +): + """Create a lists column of distinct elements common to two input lists columns. + + For details, see :cpp:func:`intersect_distinct`. + + Parameters + ---------- + lhs : Column + The input lists column of elements that may be included. + rhs : Column + The input lists column of elements to exclude. + nulls_equal : bool, default True + If true, null elements are considered equal. Otherwise, unequal. + nans_equal : bool, default True + If true, libcudf will treat nan elements from {-nan, +nan} + as equal. Otherwise, unequal. Otherwise, unequal. + + Returns + ------- + Column + A lists column containing the intersection results. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView lhs_view = lhs.list_view() + cdef ListColumnView rhs_view = rhs.list_view() + + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL + ) + + with nogil: + c_result = move(cpp_set_operations.intersect_distinct( + lhs_view.view(), + rhs_view.view(), + c_nulls_equal, + c_nans_equal, + )) + return Column.from_libcudf(move(c_result)) + + +cpdef Column union_distinct( + Column lhs, + Column rhs, + bool nulls_equal=True, + bool nans_equal=True +): + """Create a lists column of distinct elements found in + either of two input lists columns. 
+ + For details, see :cpp:func:`union_distinct`. + + Parameters + ---------- + lhs : Column + The input lists column of elements that may be included. + rhs : Column + The input lists column of elements to exclude. + nulls_equal : bool, default True + If true, null elements are considered equal. Otherwise, unequal. + nans_equal : bool, default True + If true, libcudf will treat nan elements from {-nan, +nan} + as equal. Otherwise, unequal. Otherwise, unequal. + + Returns + ------- + Column + A lists column containing the union results. + """ + cdef unique_ptr[column] c_result + cdef ListColumnView lhs_view = lhs.list_view() + cdef ListColumnView rhs_view = rhs.list_view() + + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL + ) + + with nogil: + c_result = move(cpp_set_operations.union_distinct( + lhs_view.view(), + rhs_view.view(), + c_nulls_equal, + c_nans_equal, + )) + return Column.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py index 0b2e0e00ce8..f135ab4ccff 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_lists.py +++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py @@ -1,5 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. +import numpy as np import pyarrow as pa import pytest from utils import assert_column_eq @@ -22,6 +23,13 @@ def column(): return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32()) +@pytest.fixture +def set_lists_column(): + lhs = [[np.nan, np.nan, 2, 1, 2], [1, 2, 3], None, [4, None, 5]] + rhs = [[np.nan, 1, 2, 3], [4, 5], [None, 7, 8], [None, None]] + return lhs, rhs + + @pytest.fixture def lists_column(): return [[4, 2, 3, 1], [1, 2, None, 4], [-10, 10, 10, 0]] @@ -253,3 +261,85 @@ def test_sort_lists(lists_column, ascending, na_position, expected): assert_column_eq(expect, res) assert_column_eq(expect, res_stable) + + +@pytest.mark.parametrize( + "set_operation,nans_equal,nulls_equal,expected", + [ + ( + plc.lists.difference_distinct, + True, + True, + [[], [1, 2, 3], None, [4, 5]], + ), + ( + plc.lists.difference_distinct, + False, + True, + [[], [1, 2, 3], None, [4, None, 5]], + ), + ( + plc.lists.have_overlap, + True, + True, + [True, False, None, True], + ), + ( + plc.lists.have_overlap, + False, + False, + [True, False, None, False], + ), + ( + plc.lists.intersect_distinct, + True, + True, + [[np.nan, 1, 2], [], None, [None]], + ), + ( + plc.lists.intersect_distinct, + True, + False, + [[1, 2], [], None, [None]], + ), + ( + plc.lists.union_distinct, + False, + True, + [ + [np.nan, 2, 1, 3], + [1, 2, 3, 4, 5], + None, + [4, None, 5, None, None], + ], + ), + ( + plc.lists.union_distinct, + False, + False, + [ + [np.nan, np.nan, 2, 1, np.nan, 3], + [1, 2, 3, 4, 5], + None, + [4, None, 5, None, None], + ], + ), + ], +) +def test_set_operations( + set_lists_column, set_operation, nans_equal, nulls_equal, expected +): + lhs, rhs = set_lists_column + + res = set_operation( + plc.interop.from_arrow(pa.array(lhs)), + plc.interop.from_arrow(pa.array(rhs)), + nans_equal, + nulls_equal, + ) + + if set_operation != plc.lists.have_overlap: + expect = pa.array(expected, type=pa.list_(pa.float64())) + else: + expect = pa.array(expected) + assert_column_eq(expect, res)
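
Taken together, patches 14 and 15 expose list sequence generation and list set operations through pylibcudf. The sketch below is an illustrative use of the new bindings and is not part of the patch series itself; it assumes a cudf build that contains both patches and uses the in-tree import path that the pylibcudf tests rely on at this point in the series. Expected results are noted in comments; element order within each list produced by the set operations is not guaranteed.

```
# Illustrative sketch only (not part of the patches above).
# Assumes a build including patches 14 and 15; the import path below is the
# in-tree location used by the pylibcudf tests at the time of this series.
import pyarrow as pa

import cudf._lib.pylibcudf as plc

# lists.sequences(starts, sizes, steps=None): one generated sequence per row.
starts = plc.interop.from_arrow(pa.array([0, 10]))
sizes = plc.interop.from_arrow(pa.array([3, 2]))
steps = plc.interop.from_arrow(pa.array([1, 5]))
seq = plc.lists.sequences(starts, sizes, steps)
# seq holds [[0, 1, 2], [10, 15]]

# lists.have_overlap / lists.union_distinct: row-wise set operations on two
# lists columns, with nulls and NaNs treated as equal by default.
lhs = plc.interop.from_arrow(pa.array([[1, 2, 2], [3]]))
rhs = plc.interop.from_arrow(pa.array([[2, 4], [None]]))
overlap = plc.lists.have_overlap(lhs, rhs)   # [True, False]
union = plc.lists.union_distinct(lhs, rhs)   # [[1, 2, 4], [3, None]], order unspecified
```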