diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 35cf90411f2..54070ab6f5a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -365,6 +365,7 @@ add_library( src/interop/to_arrow_device.cu src/interop/from_arrow_device.cu src/interop/from_arrow_host.cu + src/interop/from_arrow_stream.cu src/interop/to_arrow_schema.cpp src/interop/detail/arrow_allocator.cpp src/io/avro/avro.cpp diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 56ec62fa6e1..502ffb9ba4f 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -50,6 +50,8 @@ struct ArrowSchema; struct ArrowArray; +struct ArrowArrayStream; + namespace cudf { /** * @addtogroup interop_dlpack @@ -367,10 +369,11 @@ std::unique_ptr from_arrow( * @param mr Device memory resource used to allocate `cudf::table` * @return cudf table generated from given arrow data */ -std::unique_ptr from_arrow(ArrowSchema const* schema, - ArrowArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr from_arrow( + ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input @@ -385,10 +388,11 @@ std::unique_ptr from_arrow(ArrowSchema const* schema, * @param mr Device memory resource used to allocate `cudf::column` * @return cudf column generated from given arrow data */ -std::unique_ptr from_arrow_column(ArrowSchema const* schema, - ArrowArray const* input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +std::unique_ptr from_arrow_column( + ArrowSchema const* schema, + ArrowArray const* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::table` from given 
ArrowDeviceArray input @@ -414,6 +418,24 @@ std::unique_ptr from_arrow_host( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create `cudf::table` from given ArrowArrayStream input + * + * @throws std::invalid_argument if input is NULL + * + * The conversion WILL release the input ArrowArrayStream and its constituent + * arrays or schema since Arrow streams are not suitable for multiple reads. + * + * @param input `ArrowArrayStream` pointer to object that will produce ArrowArray data + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to perform cuda allocation + * @return cudf table generated from the given Arrow data + */ +std::unique_ptr
from_arrow_stream( + ArrowArrayStream* input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Create `cudf::column` from given ArrowDeviceArray input * diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 47e74a5cb48..6acbafd24fb 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -463,10 +463,6 @@ void traverse_children::operator()(host_span */ void bounds_and_type_check(host_span cols, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(cudf::all_have_same_types(cols.begin(), cols.end()), - "Type mismatch in columns to concatenate.", - cudf::data_type_error); - // total size of all concatenated rows size_t const total_row_count = std::accumulate(cols.begin(), cols.end(), std::size_t{}, [](size_t a, auto const& b) { @@ -476,6 +472,21 @@ void bounds_and_type_check(host_span cols, rmm::cuda_stream_v "Total number of concatenated rows exceeds the column size limit", std::overflow_error); + if (std::any_of(cols.begin(), cols.end(), [](column_view const& c) { + return c.type().id() == cudf::type_id::EMPTY; + })) { + CUDF_EXPECTS( + std::all_of(cols.begin(), + cols.end(), + [](column_view const& c) { return c.type().id() == cudf::type_id::EMPTY; }), + "Mismatch in columns to concatenate.", + cudf::data_type_error); + return; + } + CUDF_EXPECTS(cudf::all_have_same_types(cols.begin(), cols.end()), + "Type mismatch in columns to concatenate.", + cudf::data_type_error); + // traverse children cudf::type_dispatcher(cols.front().type(), traverse_children{}, cols, stream); } @@ -498,6 +509,15 @@ std::unique_ptr concatenate(host_span columns_to_conc return empty_like(columns_to_concat.front()); } + // For empty columns, we can just create an EMPTY column of the appropriate length. 
+ if (columns_to_concat.front().type().id() == cudf::type_id::EMPTY) { + auto length = std::accumulate( + columns_to_concat.begin(), columns_to_concat.end(), 0, [](auto a, auto const& b) { + return a + b.size(); + }); + return std::make_unique( + data_type(type_id::EMPTY), length, rmm::device_buffer{}, rmm::device_buffer{}, length); + } return type_dispatcher( columns_to_concat.front().type(), concatenate_dispatch{columns_to_concat, stream, mr}); } diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp index dd9e9600a87..605d813ed1e 100644 --- a/cpp/src/interop/arrow_utilities.cpp +++ b/cpp/src/interop/arrow_utilities.cpp @@ -39,7 +39,8 @@ data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view) case NANOARROW_TYPE_FLOAT: return data_type(type_id::FLOAT32); case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64); case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS); - case NANOARROW_TYPE_STRING: return data_type(type_id::STRING); + case NANOARROW_TYPE_STRING: + case NANOARROW_TYPE_LARGE_STRING: return data_type(type_id::STRING); case NANOARROW_TYPE_LIST: return data_type(type_id::LIST); case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32); case NANOARROW_TYPE_STRUCT: return data_type(type_id::STRUCT); diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 002a8ec1f14..73c1a474310 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -143,6 +143,9 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(schema->type != NANOARROW_TYPE_LARGE_STRING, + "Large strings are not yet supported in from_arrow_device", + cudf::data_type_error); if (input->length == 0) { return std::make_tuple( {type, diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index 854a1d68fdc..b7e07056686 100644 --- 
a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -188,8 +188,16 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()(offset_buffers[1])[input->length + input->offset]; + int64_t const char_data_length = [&]() { + if (schema->type == NANOARROW_TYPE_LARGE_STRING) { + return reinterpret_cast(offset_buffers[1])[input->length + input->offset]; + } else if (schema->type == NANOARROW_TYPE_STRING) { + return static_cast( + reinterpret_cast(offset_buffers[1])[input->length + input->offset]); + } else { + CUDF_FAIL("Unsupported string type", cudf::data_type_error); + } + }(); void const* char_buffers[2] = {nullptr, input->buffers[2]}; ArrowArray char_array = { .length = char_data_length, @@ -210,15 +218,27 @@ std::unique_ptr dispatch_copy_from_arrow_host::operator()operator()(&view, &offsets_array, data_type(type_id::INT32), true); + auto offsets_column = [&]() { + if (schema->type == NANOARROW_TYPE_LARGE_STRING) { + return this->operator()(&view, &offsets_array, data_type(type_id::INT64), true); + } else if (schema->type == NANOARROW_TYPE_STRING) { + return this->operator()(&view, &offsets_array, data_type(type_id::INT32), true); + } else { + CUDF_FAIL("Unsupported string type", cudf::data_type_error); + } + }(); NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, char_data_schema.get(), nullptr)); - auto chars_column = this->operator()(&view, &char_array, data_type(type_id::INT8), true); + rmm::device_buffer chars(char_data_length, stream, mr); + CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(), + reinterpret_cast(char_array.buffers[1]), + chars.size(), + cudaMemcpyDefault, + stream.value())); auto const num_rows = offsets_column->size() - 1; auto out_col = make_strings_column(num_rows, std::move(offsets_column), - std::move(chars_column->release().data.release()[0]), + std::move(chars), input->null_count, std::move(*get_mask_buffer(input))); diff --git a/cpp/src/interop/from_arrow_stream.cu b/cpp/src/interop/from_arrow_stream.cu new file 
mode 100644 index 00000000000..0c85b561944 --- /dev/null +++ b/cpp/src/interop/from_arrow_stream.cu @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "arrow_utilities.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +namespace cudf { +namespace detail { + +namespace { + +std::unique_ptr make_empty_column_from_schema(ArrowSchema const* schema, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + ArrowSchemaView schema_view; + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, nullptr)); + + auto const type{arrow_to_cudf_type(&schema_view)}; + switch (type.id()) { + case type_id::EMPTY: { + return std::make_unique( + data_type(type_id::EMPTY), 0, rmm::device_buffer{}, rmm::device_buffer{}, 0); + } + case type_id::LIST: { + return cudf::make_lists_column(0, + cudf::make_empty_column(data_type{type_id::INT32}), + make_empty_column_from_schema(schema->children[0], stream, mr), + 0, + {}, + stream, + mr); + } + case type_id::STRUCT: { + std::vector> child_columns; + child_columns.reserve(schema->n_children); + std::transform( + schema->children, + schema->children + schema->n_children, + std::back_inserter(child_columns), + [&](auto const& child) { return make_empty_column_from_schema(child, stream, mr); }); + return 
cudf::make_structs_column(0, std::move(child_columns), 0, {}, stream, mr); + } + default: { + return cudf::make_empty_column(type); + } + } +} + +} // namespace + +std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(input != nullptr, "input ArrowArrayStream must not be NULL", std::invalid_argument); + + // Potential future optimization: Since the from_arrow API accepts an + // ArrowSchema we're allocating one here instead of using a view, which we + // could avoid with a different underlying implementation. + ArrowSchema schema; + NANOARROW_THROW_NOT_OK(ArrowArrayStreamGetSchema(input, &schema, nullptr)); + + std::vector> chunks; + ArrowArray chunk; + while (true) { + NANOARROW_THROW_NOT_OK(ArrowArrayStreamGetNext(input, &chunk, nullptr)); + if (chunk.release == nullptr) { break; } + chunks.push_back(from_arrow(&schema, &chunk, stream, mr)); + chunk.release(&chunk); + } + input->release(input); + + if (chunks.empty()) { + if (schema.n_children == 0) { + schema.release(&schema); + return std::make_unique(); + } + + // If there are no chunks but the schema has children, we need to construct a suitable empty + // table. + std::vector> columns; + columns.reserve(schema.n_children); + std::transform( + schema.children, + schema.children + schema.n_children, + std::back_inserter(columns), + [&](auto const& child) { return make_empty_column_from_schema(child, stream, mr); }); + schema.release(&schema); + return std::make_unique(std::move(columns)); + } + + schema.release(&schema); + + auto chunk_views = std::vector{}; + chunk_views.reserve(chunks.size()); + std::transform( + chunks.begin(), chunks.end(), std::back_inserter(chunk_views), [](auto const& chunk) { + return chunk->view(); + }); + return cudf::detail::concatenate(chunk_views, stream, mr); +} + +} // namespace detail + +std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::from_arrow_stream(input, stream, mr); +} +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 244bcb7d897..0eab9ba61d8 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -273,6 +273,7 @@ ConfigureTest( interop/from_arrow_test.cpp interop/from_arrow_device_test.cpp interop/from_arrow_host_test.cpp + interop/from_arrow_stream_test.cpp interop/dlpack_test.cpp EXTRA_LIB nanoarrow diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp index 054441788d0..18140c34abd 100644 --- a/cpp/tests/copying/concatenate_tests.cpp +++ b/cpp/tests/copying/concatenate_tests.cpp @@ -1667,3 +1667,63 @@ TEST_F(DictionaryConcatTest, ErrorsTest) std::vector empty; EXPECT_THROW(cudf::concatenate(empty), cudf::logic_error); } + +struct EmptyColumnTest : public cudf::test::BaseFixture {}; + +TEST_F(EmptyColumnTest, SimpleTest) +{ + std::vector columns; + constexpr auto num_copies = 10; + constexpr auto num_rows = 10; + for (auto i = 0; i < num_copies; ++i) { + columns.emplace_back(cudf::data_type(cudf::type_id::EMPTY), + num_rows, + rmm::device_buffer{}, + rmm::device_buffer{}, + 0); + } + + // Create views from columns + std::vector views; + for (auto& col : columns) { + views.push_back(col.view()); + } + auto result = cudf::concatenate(views); + + ASSERT_EQ(result->size(), num_copies * num_rows); + ASSERT_EQ(result->type().id(), cudf::type_id::EMPTY); +} + +struct TableOfEmptyColumnsTest : public cudf::test::BaseFixture {}; + +TEST_F(TableOfEmptyColumnsTest, SimpleTest) +{ + std::vector tables; + constexpr auto num_copies = 10; + constexpr auto num_rows = 10; + constexpr auto num_columns = 10; + for (auto i = 0; i < num_copies; ++i) { + std::vector> columns; + for (auto j = 0; j < num_columns; ++j) { + 
columns.push_back(std::make_unique(cudf::data_type(cudf::type_id::EMPTY), + num_rows, + rmm::device_buffer{}, + rmm::device_buffer{}, + 0)); + } + tables.emplace_back(std::move(columns)); + } + + // Create views from tables + std::vector views; + for (auto& tbl : tables) { + views.push_back(tbl.view()); + } + auto result = cudf::concatenate(views); + + ASSERT_EQ(result->num_rows(), num_copies * num_rows); + ASSERT_EQ(result->num_columns(), num_columns); + for (auto i = 0; i < num_columns; ++i) { + ASSERT_EQ(result->get_column(i).type().id(), cudf::type_id::EMPTY); + } +} diff --git a/cpp/tests/interop/from_arrow_stream_test.cpp b/cpp/tests/interop/from_arrow_stream_test.cpp new file mode 100644 index 00000000000..418ec057303 --- /dev/null +++ b/cpp/tests/interop/from_arrow_stream_test.cpp @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "nanoarrow_utils.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct VectorOfArrays { + std::vector arrays; + nanoarrow::UniqueSchema schema; + size_t index{0}; + + static int get_schema(ArrowArrayStream* stream, ArrowSchema* out_schema) + { + auto private_data = static_cast(stream->private_data); + ArrowSchemaDeepCopy(private_data->schema.get(), out_schema); + return 0; + } + + static int get_next(ArrowArrayStream* stream, ArrowArray* out_array) + { + auto private_data = static_cast(stream->private_data); + if (private_data->index >= private_data->arrays.size()) { + out_array->release = nullptr; + return 0; + } + ArrowArrayMove(private_data->arrays[private_data->index++].get(), out_array); + return 0; + } + + static const char* get_last_error(ArrowArrayStream* stream) { return nullptr; } + + static void release(ArrowArrayStream* stream) + { + delete static_cast(stream->private_data); + } +}; + +struct FromArrowStreamTest : public cudf::test::BaseFixture {}; + +void makeStreamFromArrays(std::vector arrays, + nanoarrow::UniqueSchema schema, + ArrowArrayStream* out) +{ + auto* private_data = new VectorOfArrays{std::move(arrays), std::move(schema)}; + out->get_schema = VectorOfArrays::get_schema; + out->get_next = VectorOfArrays::get_next; + out->get_last_error = VectorOfArrays::get_last_error; + out->release = VectorOfArrays::release; + out->private_data = private_data; +} + +TEST_F(FromArrowStreamTest, BasicTest) +{ + constexpr auto num_copies = 3; + std::vector> tables; + // The schema is unique across all tables. 
+ nanoarrow::UniqueSchema schema; + std::vector arrays; + for (auto i = 0; i < num_copies; ++i) { + auto [tbl, sch, arr] = get_nanoarrow_host_tables(0); + tables.push_back(std::move(tbl)); + arrays.push_back(std::move(arr)); + if (i == 0) { sch.move(schema.get()); } + } + std::vector table_views; + for (auto const& table : tables) { + table_views.push_back(table->view()); + } + auto expected = cudf::concatenate(table_views); + + ArrowArrayStream stream; + makeStreamFromArrays(std::move(arrays), std::move(schema), &stream); + auto result = cudf::from_arrow_stream(&stream); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result->view()); +} + +TEST_F(FromArrowStreamTest, EmptyTest) +{ + auto [tbl, sch, arr] = get_nanoarrow_host_tables(0); + std::vector table_views{tbl->view()}; + auto expected = cudf::concatenate(table_views); + + ArrowArrayStream stream; + makeStreamFromArrays({}, std::move(sch), &stream); + auto result = cudf::from_arrow_stream(&stream); + EXPECT_TRUE(cudf::have_same_types(expected->view(), result->view())); +} diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp index 94c4372e74a..4147728b2a6 100644 --- a/cpp/tests/interop/nanoarrow_utils.hpp +++ b/cpp/tests/interop/nanoarrow_utils.hpp @@ -375,3 +375,6 @@ nanoarrow::UniqueArray get_nanoarrow_list_array(std::initializer_list data, std::tuple, nanoarrow::UniqueSchema, generated_test_data> get_nanoarrow_cudf_table(cudf::size_type length); + +std::tuple, nanoarrow::UniqueSchema, nanoarrow::UniqueArray> +get_nanoarrow_host_tables(cudf::size_type length); diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index 07e9d1ead11..adf7e1fd7e8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION.
+from cpython cimport pycapsule from cython.operator cimport dereference from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move @@ -11,9 +12,15 @@ from functools import singledispatch from pyarrow import lib as pa +from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.interop cimport ( + ArrowArray, + ArrowArrayStream, + ArrowSchema, column_metadata, from_arrow as cpp_from_arrow, + from_arrow_column as cpp_from_arrow_column, + from_arrow_stream as cpp_from_arrow_stream, to_arrow as cpp_to_arrow, ) from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport ( @@ -124,11 +131,15 @@ def _from_arrow_datatype(pyarrow_object): def _from_arrow_table(pyarrow_object, *, DataType data_type=None): if data_type is not None: raise ValueError("data_type may not be passed for tables") - cdef shared_ptr[pa.CTable] arrow_table = pa.pyarrow_unwrap_table(pyarrow_object) + stream = pyarrow_object.__arrow_c_stream__() + cdef ArrowArrayStream* c_stream = ( + pycapsule.PyCapsule_GetPointer(stream, "arrow_array_stream") + ) cdef unique_ptr[table] c_result with nogil: - c_result = move(cpp_from_arrow(dereference(arrow_table))) + # The libcudf function here will release the stream. 
+ c_result = move(cpp_from_arrow_stream(c_stream)) return Table.from_libcudf(move(c_result)) @@ -190,8 +201,25 @@ def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None): def _from_arrow_column(pyarrow_object, *, DataType data_type=None): if data_type is not None: raise ValueError("data_type may not be passed for arrays") - pa_table = pa.table([pyarrow_object], [""]) - return from_arrow(pa_table).columns()[0] + + schema, array = pyarrow_object.__arrow_c_array__() + cdef ArrowSchema* c_schema = ( + pycapsule.PyCapsule_GetPointer(schema, "arrow_schema") + ) + cdef ArrowArray* c_array = ( + pycapsule.PyCapsule_GetPointer(array, "arrow_array") + ) + + cdef unique_ptr[column] c_result + with nogil: + c_result = move(cpp_from_arrow_column(c_schema, c_array)) + + # The capsule destructors should release automatically for us, but we + # choose to do it explicitly here for clarity. + c_schema.release(c_schema) + c_array.release(c_array) + + return Column.from_libcudf(move(c_result)) @singledispatch diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd index 471b78505fb..2151da28d4b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd @@ -7,6 +7,7 @@ from pyarrow.lib cimport CScalar, CTable from cudf._lib.types import cudf_to_np_types, np_to_cudf_types +from cudf._lib.pylibcudf.libcudf.column.column cimport column from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar from cudf._lib.pylibcudf.libcudf.table.table cimport table from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view @@ -16,6 +17,19 @@ cdef extern from "dlpack/dlpack.h" nogil: ctypedef struct DLManagedTensor: void(*deleter)(DLManagedTensor*) except + + +# The Arrow structs are not namespaced. 
+cdef extern from "cudf/interop.hpp" nogil: + cdef struct ArrowSchema: + void (*release)(ArrowSchema*) noexcept nogil + + cdef struct ArrowArray: + void (*release)(ArrowArray*) noexcept nogil + + cdef struct ArrowArrayStream: + void (*release)(ArrowArrayStream*) noexcept nogil + + cdef extern from "cudf/interop.hpp" namespace "cudf" \ nogil: cdef unique_ptr[table] from_dlpack(const DLManagedTensor* tensor @@ -42,3 +56,9 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ const scalar& input, column_metadata metadata, ) except + + + cdef unique_ptr[table] from_arrow_stream(ArrowArrayStream* input) except + + cdef unique_ptr[column] from_arrow_column( + const ArrowSchema* schema, + const ArrowArray* input + ) except + diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index f2501041f25..8ed78d804bf 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2757,8 +2757,6 @@ def test_series_from_large_string(pa_type): assert_eq(expected, got) - assert pa_string_array.equals(got.to_arrow()) - @pytest.mark.parametrize( "scalar",