From c27b915c9ee6c5a97154d8b35da9118cdf706091 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 14 Jul 2023 13:18:28 -0700 Subject: [PATCH 01/11] move arrow_io_source to separate header --- conda/recipes/libcudf/meta.yaml | 1 + cpp/include/cudf/io/arrow_io_source.hpp | 164 ++++++++++++++++++++++++ cpp/include/cudf/io/datasource.hpp | 147 +-------------------- cpp/src/io/utilities/datasource.cpp | 1 + cpp/tests/io/arrow_io_source_test.cpp | 2 +- cpp/tests/io/csv_test.cpp | 2 +- cpp/tests/io/json_test.cpp | 2 +- 7 files changed, 176 insertions(+), 143 deletions(-) create mode 100644 cpp/include/cudf/io/arrow_io_source.hpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 4e9b5e2fdc1..45f4a742165 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -193,6 +193,7 @@ outputs: - test -f $PREFIX/include/cudf/groupby.hpp - test -f $PREFIX/include/cudf/hashing.hpp - test -f $PREFIX/include/cudf/interop.hpp + - test -f $PREFIX/include/cudf/io/arrow_io_source.hpp - test -f $PREFIX/include/cudf/io/avro.hpp - test -f $PREFIX/include/cudf/io/csv.hpp - test -f $PREFIX/include/cudf/io/data_sink.hpp diff --git a/cpp/include/cudf/io/arrow_io_source.hpp b/cpp/include/cudf/io/arrow_io_source.hpp new file mode 100644 index 00000000000..d0b9d6795e7 --- /dev/null +++ b/cpp/include/cudf/io/arrow_io_source.hpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "datasource.hpp" + +#include + +// We disable warning 611 because some Arrow subclasses of +// `arrow::fs::FileSystem` only partially override the `Equals` method, +// triggering warning 611-D from nvcc. +#ifdef __CUDACC__ +#pragma nv_diag_suppress 611 +#endif +#include +#include +#ifdef __CUDACC__ +#pragma nv_diag_default 611 +#endif + +// We disable warning 2810 to workaround the compile issue (warning treated as error): +// result.h(263): error #2810-D: ignoring return value type with "nodiscard" attribute +#ifdef __CUDACC__ +#pragma nv_diag_suppress 2810 +#endif +#include +#ifdef __CUDACC__ +#pragma nv_diag_default 2810 +#endif + +#include +#include +#include +#include + +#include +#include + +namespace cudf::io { + +/** + * @addtogroup io_datasources + * @{ + * @file + */ + +/** + * @brief Implementation class for reading from an Apache Arrow file. The file + * could be a memory-mapped file or other implementation supported by Arrow. + */ +class arrow_io_source : public datasource { + /** + * @brief Implementation for an owning buffer where `arrow::Buffer` holds the data. + */ + class arrow_io_buffer : public buffer { + std::shared_ptr arrow_buffer; + + public: + explicit arrow_io_buffer(std::shared_ptr arrow_buffer) + : arrow_buffer(arrow_buffer) + { + } + [[nodiscard]] size_t size() const override { return arrow_buffer->size(); } + [[nodiscard]] uint8_t const* data() const override { return arrow_buffer->data(); } + }; + + public: + /** + * @brief Constructs an object from an Apache Arrow Filesystem URI + * + * @param arrow_uri Apache Arrow Filesystem URI + */ + explicit arrow_io_source(std::string_view arrow_uri) + { + std::string const uri_start_delimiter = "//"; + std::string const uri_end_delimiter = "?"; + + arrow::Result> result = + arrow::fs::FileSystemFromUri(static_cast(arrow_uri)); + CUDF_EXPECTS(result.ok(), "Failed to generate Arrow Filesystem instance from URI."); + filesystem = result.ValueOrDie(); + + // Parse the path from the URI + size_t start = arrow_uri.find(uri_start_delimiter) == std::string::npos + ? 0 + : arrow_uri.find(uri_start_delimiter) + uri_start_delimiter.size(); + size_t end = arrow_uri.find(uri_end_delimiter) - start; + std::string_view path = arrow_uri.substr(start, end); + + arrow::Result> in_stream = + filesystem->OpenInputFile(static_cast(path).c_str()); + CUDF_EXPECTS(in_stream.ok(), "Failed to open Arrow RandomAccessFile"); + arrow_file = in_stream.ValueOrDie(); + } + + /** + * @brief Constructs an object from an `arrow` source object. + * + * @param file The `arrow` object from which the data is read + */ + explicit arrow_io_source(std::shared_ptr file) : arrow_file(file) {} + + /** + * @brief Returns a buffer with a subset of data from the `arrow` source. + * + * @param offset The offset in bytes from which to read + * @param size The number of bytes to read + * @return A buffer with the read data + */ + std::unique_ptr host_read(size_t offset, size_t size) override + { + auto result = arrow_file->ReadAt(offset, size); + CUDF_EXPECTS(result.ok(), "Cannot read file data"); + return std::make_unique(result.ValueOrDie()); + } + + /** + * @brief Reads a selected range from the `arrow` source into a preallocated buffer. + * + * @param[in] offset The offset in bytes from which to read + * @param[in] size The number of bytes to read + * @param[out] dst The preallocated buffer to read into + * @return The number of bytes read + */ + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + auto result = arrow_file->ReadAt(offset, size, dst); + CUDF_EXPECTS(result.ok(), "Cannot read file data"); + return result.ValueOrDie(); + } + + /** + * @brief Returns the size of the data in the `arrow` source. + * + * @return The size of the data in the `arrow` source + */ + [[nodiscard]] size_t size() const override + { + auto result = arrow_file->GetSize(); + CUDF_EXPECTS(result.ok(), "Cannot get file size"); + return result.ValueOrDie(); + } + + private: + std::shared_ptr filesystem; + std::shared_ptr arrow_file; +}; + +/** @} */ // end of group +} // namespace cudf::io diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 1d4943d9826..28263d466f3 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -22,35 +22,6 @@ #include -#include - -// We disable warning 611 because some Arrow subclasses of -// `arrow::fs::FileSystem` only partially override the `Equals` method, -// triggering warning 611-D from nvcc. -#ifdef __CUDACC__ -#pragma nv_diag_suppress 611 -#endif -#include -#include -#ifdef __CUDACC__ -#pragma nv_diag_default 611 -#endif - -// We disable warning 2810 to workaround the compile issue (warning treated as error): -// result.h(263): error #2810-D: ignoring return value type with "nodiscard" attribute -#ifdef __CUDACC__ -#pragma nv_diag_suppress 2810 -#endif -#include -#ifdef __CUDACC__ -#pragma nv_diag_default 2810 -#endif - -#include -#include -#include -#include - #include #include @@ -58,6 +29,12 @@ namespace cudf { //! IO interfaces namespace io { +/** + * @addtogroup io_datasources + * @{ + * @file + */ + /** * @brief Interface class for providing input data to the readers. */ @@ -143,15 +120,6 @@ class datasource { */ static std::unique_ptr create(cudf::device_span buffer); - /** - * @brief Creates a source from a from an Arrow file. - * - * @param[in] arrow_file RandomAccessFile to which the API calls are forwarded - * @return Constructed datasource object - */ - static std::unique_ptr create( - std::shared_ptr arrow_file); - /** * @brief Creates a source from an user implemented datasource object. * @@ -406,107 +374,6 @@ class datasource { }; }; -/** - * @brief Implementation class for reading from an Apache Arrow file. The file - * could be a memory-mapped file or other implementation supported by Arrow. - */ -class arrow_io_source : public datasource { - /** - * @brief Implementation for an owning buffer where `arrow::Buffer` holds the data. - */ - class arrow_io_buffer : public buffer { - std::shared_ptr arrow_buffer; - - public: - explicit arrow_io_buffer(std::shared_ptr arrow_buffer) - : arrow_buffer(arrow_buffer) - { - } - [[nodiscard]] size_t size() const override { return arrow_buffer->size(); } - [[nodiscard]] uint8_t const* data() const override { return arrow_buffer->data(); } - }; - - public: - /** - * @brief Constructs an object from an Apache Arrow Filesystem URI - * - * @param arrow_uri Apache Arrow Filesystem URI - */ - explicit arrow_io_source(std::string_view arrow_uri) - { - std::string const uri_start_delimiter = "//"; - std::string const uri_end_delimiter = "?"; - - arrow::Result> result = - arrow::fs::FileSystemFromUri(static_cast(arrow_uri)); - CUDF_EXPECTS(result.ok(), "Failed to generate Arrow Filesystem instance from URI."); - filesystem = result.ValueOrDie(); - - // Parse the path from the URI - size_t start = arrow_uri.find(uri_start_delimiter) == std::string::npos - ? 0 - : arrow_uri.find(uri_start_delimiter) + uri_start_delimiter.size(); - size_t end = arrow_uri.find(uri_end_delimiter) - start; - std::string_view path = arrow_uri.substr(start, end); - - arrow::Result> in_stream = - filesystem->OpenInputFile(static_cast(path).c_str()); - CUDF_EXPECTS(in_stream.ok(), "Failed to open Arrow RandomAccessFile"); - arrow_file = in_stream.ValueOrDie(); - } - - /** - * @brief Constructs an object from an `arrow` source object. - * - * @param file The `arrow` object from which the data is read - */ - explicit arrow_io_source(std::shared_ptr file) : arrow_file(file) {} - - /** - * @brief Returns a buffer with a subset of data from the `arrow` source. - * - * @param offset The offset in bytes from which to read - * @param size The number of bytes to read - * @return A buffer with the read data - */ - std::unique_ptr host_read(size_t offset, size_t size) override - { - auto result = arrow_file->ReadAt(offset, size); - CUDF_EXPECTS(result.ok(), "Cannot read file data"); - return std::make_unique(result.ValueOrDie()); - } - - /** - * @brief Reads a selected range from the `arrow` source into a preallocated buffer. - * - * @param[in] offset The offset in bytes from which to read - * @param[in] size The number of bytes to read - * @param[out] dst The preallocated buffer to read into - * @return The number of bytes read - */ - size_t host_read(size_t offset, size_t size, uint8_t* dst) override - { - auto result = arrow_file->ReadAt(offset, size, dst); - CUDF_EXPECTS(result.ok(), "Cannot read file data"); - return result.ValueOrDie(); - } - - /** - * @brief Returns the size of the data in the `arrow` source. - * - * @return The size of the data in the `arrow` source - */ - [[nodiscard]] size_t size() const override - { - auto result = arrow_file->GetSize(); - CUDF_EXPECTS(result.ok(), "Cannot get file size"); - return result.ValueOrDie(); - } - - private: - std::shared_ptr filesystem; - std::shared_ptr arrow_file; -}; - +/** @} */ // end of group } // namespace io } // namespace cudf diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 8aea8b4f69c..e282b153c31 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -17,6 +17,7 @@ #include "file_io_utilities.hpp" #include +#include #include #include #include diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp index 2961deec384..0c9696781bd 100644 --- a/cpp/tests/io/arrow_io_source_test.cpp +++ b/cpp/tests/io/arrow_io_source_test.cpp @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 9da97c00712..080cd223c24 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -23,8 +23,8 @@ #include #include +#include #include -#include #include #include #include diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index e4d52a2953e..c1e2f831480 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include #include From f44947ed2d18c409a3828de9a955ebffcd2bbcc0 Mon Sep 17 00:00:00 2001 From: vuule Date: Fri, 14 Jul 2023 13:18:46 -0700 Subject: [PATCH 02/11] remove unused members --- cpp/include/cudf/io/types.hpp | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index b08c50574b6..9b0dcff99af 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -32,13 +32,6 @@ #include #include -// Forward declarations -namespace arrow { -namespace io { -class RandomAccessFile; -} -} // namespace arrow - namespace cudf { //! IO interfaces namespace io { @@ -286,8 +279,6 @@ constexpr inline auto is_byte_like_type() * @brief Source information for read interfaces */ struct source_info { - std::vector> _files; //!< Input files - source_info() = default; /** @@ -438,12 +429,6 @@ struct source_info { * @return The device buffers of the input */ [[nodiscard]] auto const& device_buffers() const { return _device_buffers; } - /** - * @brief Get the input files - * - * @return The input files - */ - [[nodiscard]] auto const& files() const { return _files; } /** * @brief Get the user sources of the input * From 650811f20dea691596d00555b36a33369ae545f1 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 17 Jul 2023 15:20:31 -0700 Subject: [PATCH 03/11] python working? --- python/cudf/cudf/_lib/cpp/io/types.pxd | 5 +++++ python/cudf/cudf/_lib/io/datasource.pxd | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/cpp/io/types.pxd index b2b0a77c45f..096f18037fb 100644 --- a/python/cudf/cudf/_lib/cpp/io/types.pxd +++ b/python/cudf/cudf/_lib/cpp/io/types.pxd @@ -128,12 +128,17 @@ cdef extern from "cudf/io/data_sink.hpp" \ cdef cppclass data_sink: pass +# TODO move to separate pxd file cdef extern from "cudf/io/datasource.hpp" \ namespace "cudf::io" nogil: cdef cppclass datasource: pass +# TODO move to new pxd file +cdef extern from "cudf/io/arrow_io_source.hpp" \ + namespace "cudf::io" nogil: + cdef cppclass arrow_io_source(datasource): arrow_io_source(string arrow_uri) except + arrow_io_source(shared_ptr[CRandomAccessFile]) except + diff --git a/python/cudf/cudf/_lib/io/datasource.pxd b/python/cudf/cudf/_lib/io/datasource.pxd index a7a3731a0e6..4c290040d1b 100644 --- a/python/cudf/cudf/_lib/io/datasource.pxd +++ b/python/cudf/cudf/_lib/io/datasource.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp.memory cimport shared_ptr @@ -8,6 +8,7 @@ from cudf._lib.cpp.io.types cimport arrow_io_source, datasource cdef class Datasource: cdef datasource* get_datasource(self) nogil except * +# TODO move to new pxd file cdef class NativeFileDatasource(Datasource): cdef shared_ptr[arrow_io_source] c_datasource cdef datasource* get_datasource(self) nogil From 4cdd6ddf1fd720820d610a75e0f3a1a1e1cb7320 Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 2 Aug 2023 13:22:35 -0700 Subject: [PATCH 04/11] cython updates/cleanup --- .../cudf/cudf/_lib/cpp/io/arrow_io_source.pxd | 15 ++++++++ python/cudf/cudf/_lib/cpp/io/data_sink.pxd | 8 +++++ python/cudf/cudf/_lib/cpp/io/datasource.pxd | 8 +++++ python/cudf/cudf/_lib/cpp/io/types.pxd | 34 ++++--------------- python/cudf/cudf/_lib/io/datasource.pxd | 5 +-- python/cudf/cudf/_lib/io/datasource.pyx | 5 +-- python/cudf/cudf/_lib/io/utils.pxd | 10 ++---- python/cudf/cudf/_lib/io/utils.pyx | 6 ++-- python/cudf/cudf/_lib/parquet.pyx | 5 +-- 9 files changed, 53 insertions(+), 43 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd create mode 100644 python/cudf/cudf/_lib/cpp/io/data_sink.pxd create mode 100644 python/cudf/cudf/_lib/cpp/io/datasource.pxd diff --git a/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd b/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd new file mode 100644 index 00000000000..17d5be8e47b --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from libcpp.memory cimport shared_ptr +from libcpp.string cimport string +from pyarrow.includes.libarrow cimport CRandomAccessFile + +cimport cudf._lib.cpp.io.datasource as cudf_io_datasource + + +cdef extern from "cudf/io/arrow_io_source.hpp" \ + namespace "cudf::io" nogil: + + cdef cppclass arrow_io_source(cudf_io_datasource.datasource): + arrow_io_source(string arrow_uri) except + + arrow_io_source(shared_ptr[CRandomAccessFile]) except + diff --git a/python/cudf/cudf/_lib/cpp/io/data_sink.pxd b/python/cudf/cudf/_lib/cpp/io/data_sink.pxd new file mode 100644 index 00000000000..e939a47d7f9 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/io/data_sink.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + + +cdef extern from "cudf/io/data_sink.hpp" \ + namespace "cudf::io" nogil: + + cdef cppclass data_sink: + pass diff --git a/python/cudf/cudf/_lib/cpp/io/datasource.pxd b/python/cudf/cudf/_lib/cpp/io/datasource.pxd new file mode 100644 index 00000000000..c69aa65bd3c --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/io/datasource.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + + +cdef extern from "cudf/io/datasource.hpp" \ + namespace "cudf::io" nogil: + + cdef cppclass datasource: + pass diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/cpp/io/types.pxd index 096f18037fb..01eaca82692 100644 --- a/python/cudf/cudf/_lib/cpp/io/types.pxd +++ b/python/cudf/cudf/_lib/cpp/io/types.pxd @@ -10,6 +10,8 @@ from libcpp.unordered_map cimport unordered_map from libcpp.vector cimport vector from pyarrow.includes.libarrow cimport CRandomAccessFile +cimport cudf._lib.cpp.io.data_sink as cudf_io_data_sink +cimport cudf._lib.cpp.io.datasource as cudf_io_datasource cimport cudf._lib.cpp.table.table_view as cudf_table_view from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.types cimport size_type @@ -105,40 +107,18 @@ cdef extern from "cudf/io/types.hpp" \ source_info() except + source_info(const vector[string] &filepaths) except + source_info(const vector[host_buffer] &host_buffers) except + - source_info(datasource *source) except + - source_info(const vector[datasource*] &datasources) except + + source_info(cudf_io_datasource.datasource *source) except + + source_info(const vector[cudf_io_datasource.datasource*] &datasources) except + cdef cppclass sink_info: io_type type const vector[string]& filepaths() const vector[vector[char] *]& buffers() - const vector[data_sink *]& user_sinks() + const vector[cudf_io_data_sink.data_sink *]& user_sinks() sink_info() except + sink_info(string file_path) except + sink_info(vector[string] file_path) except + sink_info(vector[char] * buffer) except + - sink_info(data_sink * user_sink) except + - sink_info(vector[data_sink *] user_sink) except + - - -cdef extern from "cudf/io/data_sink.hpp" \ - namespace "cudf::io" nogil: - - cdef cppclass data_sink: - pass - -# TODO move to separate pxd file -cdef extern from "cudf/io/datasource.hpp" \ - namespace "cudf::io" nogil: - - cdef cppclass datasource: - pass - -# TODO move to new pxd file -cdef extern from "cudf/io/arrow_io_source.hpp" \ - namespace "cudf::io" nogil: - - cdef cppclass arrow_io_source(datasource): - arrow_io_source(string arrow_uri) except + - arrow_io_source(shared_ptr[CRandomAccessFile]) except + + sink_info(cudf_io_data_sink.data_sink * user_sink) except + + sink_info(vector[cudf_io_data_sink.data_sink *] user_sink) except + diff --git a/python/cudf/cudf/_lib/io/datasource.pxd b/python/cudf/cudf/_lib/io/datasource.pxd index 4c290040d1b..108b772a2aa 100644 --- a/python/cudf/cudf/_lib/io/datasource.pxd +++ b/python/cudf/cudf/_lib/io/datasource.pxd @@ -2,13 +2,14 @@ from libcpp.memory cimport shared_ptr -from cudf._lib.cpp.io.types cimport arrow_io_source, datasource +from cudf._lib.cpp.io.arrow_io_source cimport arrow_io_source +from cudf._lib.cpp.io.datasource cimport datasource cdef class Datasource: cdef datasource* get_datasource(self) nogil except * -# TODO move to new pxd file + cdef class NativeFileDatasource(Datasource): cdef shared_ptr[arrow_io_source] c_datasource cdef datasource* get_datasource(self) nogil diff --git a/python/cudf/cudf/_lib/io/datasource.pyx b/python/cudf/cudf/_lib/io/datasource.pyx index 7402779a6ac..0b73a78394c 100644 --- a/python/cudf/cudf/_lib/io/datasource.pyx +++ b/python/cudf/cudf/_lib/io/datasource.pyx @@ -1,10 +1,11 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp.memory cimport shared_ptr from pyarrow.includes.libarrow cimport CRandomAccessFile from pyarrow.lib cimport NativeFile -from cudf._lib.cpp.io.types cimport arrow_io_source, datasource +from cudf._lib.cpp.io.arrow_io_source cimport arrow_io_source +from cudf._lib.cpp.io.datasource cimport datasource cdef class Datasource: diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index af1f2521d4a..2c2d52b512b 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ b/python/cudf/cudf/_lib/io/utils.pxd @@ -1,15 +1,11 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector from cudf._lib.column cimport Column -from cudf._lib.cpp.io.types cimport ( - column_name_info, - data_sink, - sink_info, - source_info, -) +from cudf._lib.cpp.io.data_sink cimport data_sink +from cudf._lib.cpp.io.types cimport column_name_info, sink_info, source_info cdef source_info make_source_info(list src) except* diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx index 7dbe395be79..9b027a4d275 100644 --- a/python/cudf/cudf/_lib/io/utils.pyx +++ b/python/cudf/cudf/_lib/io/utils.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cpython.buffer cimport PyBUF_READ from cpython.memoryview cimport PyMemoryView_FromMemory @@ -8,10 +8,10 @@ from libcpp.utility cimport move from libcpp.vector cimport vector from cudf._lib.column cimport Column +from cudf._lib.cpp.io.data_sink cimport data_sink +from cudf._lib.cpp.io.datasource cimport datasource from cudf._lib.cpp.io.types cimport ( column_name_info, - data_sink, - datasource, host_buffer, sink_info, source_info, diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d297c80ab5a..85fd25cf1a9 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -38,6 +38,7 @@ from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move from libcpp.vector cimport vector +cimport cudf._lib.cpp.io.data_sink as cudf_io_data_sink cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.types as cudf_types from cudf._lib.column cimport Column @@ -334,7 +335,7 @@ def write_parquet( cdef vector[map[string, string]] user_data cdef table_view tv - cdef vector[unique_ptr[cudf_io_types.data_sink]] _data_sinks + cdef vector[unique_ptr[cudf_io_data_sink.data_sink]] _data_sinks cdef cudf_io_types.sink_info sink = make_sinks_info( filepaths_or_buffers, _data_sinks ) @@ -476,7 +477,7 @@ cdef class ParquetWriter: cdef unique_ptr[cpp_parquet_chunked_writer] writer cdef table_input_metadata tbl_meta cdef cudf_io_types.sink_info sink - cdef vector[unique_ptr[cudf_io_types.data_sink]] _data_sink + cdef vector[unique_ptr[cudf_io_data_sink.data_sink]] _data_sink cdef cudf_io_types.statistics_freq stat_freq cdef cudf_io_types.compression_type comp_type cdef object index From bb37452350cc48158d430eb67b10318fc00938ab Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 2 Aug 2023 13:57:02 -0700 Subject: [PATCH 05/11] more cython --- python/cudf/cudf/_lib/csv.pyx | 2 +- python/cudf/cudf/_lib/json.pyx | 2 +- python/cudf/cudf/_lib/orc.pyx | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index df6ed89ac7e..6d966a34d8e 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -35,9 +35,9 @@ from cudf._lib.cpp.io.csv cimport ( read_csv as cpp_read_csv, write_csv as cpp_write_csv, ) +from cudf._lib.cpp.io.data_sink cimport data_sink from cudf._lib.cpp.io.types cimport ( compression_type, - data_sink, quote_style, sink_info, source_info, diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index af4232a8734..762d0749ffa 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -17,6 +17,7 @@ from libcpp.utility cimport move from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types +from cudf._lib.cpp.io.data_sink cimport data_sink from cudf._lib.cpp.io.json cimport ( json_reader_options, json_writer_options, @@ -27,7 +28,6 @@ from cudf._lib.cpp.io.json cimport ( from cudf._lib.cpp.io.types cimport ( column_name_info, compression_type, - data_sink, sink_info, table_metadata, table_with_metadata, diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index dfe5bcf9d53..0ae039b14d2 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -21,6 +21,7 @@ except ImportError: cimport cudf._lib.cpp.io.types as cudf_io_types from cudf._lib.column cimport Column +from cudf._lib.cpp.io.data_sink cimport data_sink from cudf._lib.cpp.io.orc cimport ( chunked_orc_writer_options, orc_chunked_writer, @@ -36,7 +37,6 @@ from cudf._lib.cpp.io.orc_metadata cimport ( from cudf._lib.cpp.io.types cimport ( column_in_metadata, compression_type, - data_sink, sink_info, source_info, table_input_metadata, From 015f1ac4f619b020951b78c0cec455d5c3f22e1d Mon Sep 17 00:00:00 2001 From: vuule Date: Wed, 2 Aug 2023 14:52:46 -0700 Subject: [PATCH 06/11] bit more... --- python/cudf_kafka/cudf_kafka/_lib/kafka.pxd | 4 ++-- python/cudf_kafka/cudf_kafka/_lib/kafka.pyx | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd index e64d8f82739..ca729c62512 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libc.stdint cimport int32_t, int64_t from libcpp cimport bool @@ -7,7 +7,7 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector -from cudf._lib.cpp.io.types cimport datasource +from cudf._lib.cpp.io.datasource cimport datasource from cudf._lib.io.datasource cimport Datasource diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx index 52278188281..4d732478723 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx @@ -7,7 +7,7 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from libcpp.utility cimport move -from cudf._lib.cpp.io.types cimport datasource +from cudf._lib.cpp.io.datasource cimport datasource from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf_kafka._lib.kafka cimport kafka_consumer From 031ca64f5fab857befc650ed61e0d12aa2d1bf73 Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 7 Aug 2023 10:26:20 -0700 Subject: [PATCH 07/11] remove arrow warnings workarounds --- cpp/include/cudf/io/arrow_io_source.hpp | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/cpp/include/cudf/io/arrow_io_source.hpp b/cpp/include/cudf/io/arrow_io_source.hpp index d0b9d6795e7..93dd9a2f87f 100644 --- a/cpp/include/cudf/io/arrow_io_source.hpp +++ b/cpp/include/cudf/io/arrow_io_source.hpp @@ -20,31 +20,12 @@ #include -// We disable warning 611 because some Arrow subclasses of -// `arrow::fs::FileSystem` only partially override the `Equals` method, -// triggering warning 611-D from nvcc. -#ifdef __CUDACC__ -#pragma nv_diag_suppress 611 -#endif #include #include -#ifdef __CUDACC__ -#pragma nv_diag_default 611 -#endif - -// We disable warning 2810 to workaround the compile issue (warning treated as error): -// result.h(263): error #2810-D: ignoring return value type with "nodiscard" attribute -#ifdef __CUDACC__ -#pragma nv_diag_suppress 2810 -#endif -#include -#ifdef __CUDACC__ -#pragma nv_diag_default 2810 -#endif - #include #include #include +#include #include #include From 077bfd2a487092589530d8f33ead0d414ff5cc8e Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 7 Aug 2023 12:13:51 -0700 Subject: [PATCH 08/11] bit of clean up; fix ctor pxd declaration --- cpp/include/cudf/io/arrow_io_source.hpp | 20 ++++++------------- .../cudf/cudf/_lib/cpp/io/arrow_io_source.pxd | 2 +- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/cpp/include/cudf/io/arrow_io_source.hpp b/cpp/include/cudf/io/arrow_io_source.hpp index 93dd9a2f87f..132b8468bfe 100644 --- a/cpp/include/cudf/io/arrow_io_source.hpp +++ b/cpp/include/cudf/io/arrow_io_source.hpp @@ -19,20 +19,14 @@ #include "datasource.hpp" #include - #include -#include #include -#include #include -#include -#include #include #include namespace cudf::io { - /** * @addtogroup io_datasources * @{ @@ -65,13 +59,12 @@ class arrow_io_source : public datasource { * * @param arrow_uri Apache Arrow Filesystem URI */ - explicit arrow_io_source(std::string_view arrow_uri) + explicit arrow_io_source(std::string const& arrow_uri) { std::string const uri_start_delimiter = "//"; std::string const uri_end_delimiter = "?"; - arrow::Result> result = - arrow::fs::FileSystemFromUri(static_cast(arrow_uri)); + auto const result = arrow::fs::FileSystemFromUri(arrow_uri); CUDF_EXPECTS(result.ok(), "Failed to generate Arrow Filesystem instance from URI."); filesystem = result.ValueOrDie(); @@ -82,8 +75,7 @@ class arrow_io_source : public datasource { size_t end = arrow_uri.find(uri_end_delimiter) - start; std::string_view path = arrow_uri.substr(start, end); - arrow::Result> in_stream = - filesystem->OpenInputFile(static_cast(path).c_str()); + auto const in_stream = filesystem->OpenInputFile(static_cast(path).c_str()); CUDF_EXPECTS(in_stream.ok(), "Failed to open Arrow RandomAccessFile"); arrow_file = in_stream.ValueOrDie(); } @@ -104,7 +96,7 @@ class arrow_io_source : public datasource { */ std::unique_ptr host_read(size_t offset, size_t size) override { - auto result = arrow_file->ReadAt(offset, size); + auto const result = arrow_file->ReadAt(offset, size); CUDF_EXPECTS(result.ok(), "Cannot read file data"); return std::make_unique(result.ValueOrDie()); } @@ -119,7 +111,7 @@ class arrow_io_source : public datasource { */ size_t host_read(size_t offset, size_t size, uint8_t* dst) override { - auto result = arrow_file->ReadAt(offset, size, dst); + auto const result = arrow_file->ReadAt(offset, size, dst); CUDF_EXPECTS(result.ok(), "Cannot read file data"); return result.ValueOrDie(); } @@ -131,7 +123,7 @@ class arrow_io_source : public datasource { */ [[nodiscard]] size_t size() const override { - auto result = arrow_file->GetSize(); + auto const result = arrow_file->GetSize(); CUDF_EXPECTS(result.ok(), "Cannot get file size"); return result.ValueOrDie(); } diff --git a/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd b/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd index 17d5be8e47b..4aef4841844 100644 --- a/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd +++ b/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd @@ -11,5 +11,5 @@ cdef extern from "cudf/io/arrow_io_source.hpp" \ namespace "cudf::io" nogil: cdef cppclass arrow_io_source(cudf_io_datasource.datasource): - arrow_io_source(string arrow_uri) except + + arrow_io_source(const string& arrow_uri) except + arrow_io_source(shared_ptr[CRandomAccessFile]) except + From cdb18dab559894184731cd847dbd3dca5ea1b31a Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 7 Aug 2023 13:04:53 -0700 Subject: [PATCH 09/11] missing header in tests --- cpp/tests/io/arrow_io_source_test.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp index 0462c6e82d5..a241affecd9 100644 --- a/cpp/tests/io/arrow_io_source_test.cpp +++ b/cpp/tests/io/arrow_io_source_test.cpp @@ -26,6 +26,7 @@ #include #include +#include #include #include From 8df80115ad166b1174b8728467741e6e809eeddd Mon Sep 17 00:00:00 2001 From: vuule Date: Mon, 7 Aug 2023 16:43:33 -0700 Subject: [PATCH 10/11] minor clean up; fix pxd declaration --- cpp/include/cudf/io/arrow_io_source.hpp | 26 +++++++------------ cpp/tests/io/arrow_io_source_test.cpp | 7 +++-- .../cudf/cudf/_lib/cpp/io/arrow_io_source.pxd | 2 +- 3 files changed, 14 insertions(+), 21 deletions(-) diff --git a/cpp/include/cudf/io/arrow_io_source.hpp b/cpp/include/cudf/io/arrow_io_source.hpp index 93dd9a2f87f..187e95f0351 100644 --- a/cpp/include/cudf/io/arrow_io_source.hpp +++ b/cpp/include/cudf/io/arrow_io_source.hpp @@ -19,14 +19,9 @@ #include "datasource.hpp" #include - #include -#include #include -#include #include -#include -#include #include #include @@ -65,25 +60,24 @@ class arrow_io_source : public datasource { * * @param arrow_uri Apache Arrow Filesystem URI */ - explicit arrow_io_source(std::string_view arrow_uri) + explicit arrow_io_source(std::string const& arrow_uri) { std::string const uri_start_delimiter = "//"; std::string const uri_end_delimiter = "?"; - arrow::Result> result = - arrow::fs::FileSystemFromUri(static_cast(arrow_uri)); + auto const result = arrow::fs::FileSystemFromUri(arrow_uri); CUDF_EXPECTS(result.ok(), "Failed to generate Arrow Filesystem instance from URI."); filesystem = result.ValueOrDie(); // Parse the path from the URI - size_t start = arrow_uri.find(uri_start_delimiter) == std::string::npos - ? 0 - : arrow_uri.find(uri_start_delimiter) + uri_start_delimiter.size(); - size_t end = arrow_uri.find(uri_end_delimiter) - start; - std::string_view path = arrow_uri.substr(start, end); - - arrow::Result> in_stream = - filesystem->OpenInputFile(static_cast(path).c_str()); + auto const start = [&]() { + auto const delim_start = arrow_uri.find(uri_start_delimiter); + return delim_start == std::string::npos ? 0 : delim_start + uri_start_delimiter.size(); + }(); + auto const end = arrow_uri.find(uri_end_delimiter) - start; + auto const path = arrow_uri.substr(start, end); + + auto const in_stream = filesystem->OpenInputFile(path); CUDF_EXPECTS(in_stream.ok(), "Failed to open Arrow RandomAccessFile"); arrow_file = in_stream.ValueOrDie(); } diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp index 0462c6e82d5..979f8e4fb05 100644 --- a/cpp/tests/io/arrow_io_source_test.cpp +++ b/cpp/tests/io/arrow_io_source_test.cpp @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -48,8 +49,7 @@ TEST_F(ArrowIOTest, URIFileSystem) outfile.close(); std::string file_uri = "file://" + file_name; - std::unique_ptr datasource = - std::make_unique(file_uri); + auto datasource = std::make_unique(file_uri); // Populate the JSON Reader Options cudf::io::json_reader_options options = @@ -72,8 +72,7 @@ TEST_F(ArrowIOTest, S3FileSystem) if (s3_unsupported) { EXPECT_THROW(std::make_unique(s3_uri), cudf::logic_error); } else { - std::unique_ptr datasource = - std::make_unique(s3_uri); + auto datasource = std::make_unique(s3_uri); // Populate the Parquet Reader Options cudf::io::source_info src(datasource.get()); diff --git a/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd b/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd index 17d5be8e47b..00de9f19885 100644 --- a/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd +++ b/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd @@ -11,5 +11,5 @@ cdef extern from "cudf/io/arrow_io_source.hpp" \ namespace "cudf::io" nogil: cdef cppclass arrow_io_source(cudf_io_datasource.datasource): - arrow_io_source(string arrow_uri) except + + arrow_io_source(const string & arrow_uri) except + arrow_io_source(shared_ptr[CRandomAccessFile]) except + From 0d4c8b04c85a87a5ed34acf77aac91365a106cac Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 10 Aug 2023 00:10:05 -0700 Subject: [PATCH 11/11] separate implementation --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/io/arrow_io_source.hpp | 63 ++---------------- cpp/src/io/utilities/arrow_io_source.cpp | 85 ++++++++++++++++++++++++ cpp/src/io/utilities/datasource.cpp | 2 + 4 files changed, 93 insertions(+), 58 deletions(-) create mode 100644 cpp/src/io/utilities/arrow_io_source.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d6b2fb10c23..054f3b290a3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -408,6 +408,7 @@ add_library( src/io/text/bgzip_data_chunk_source.cu src/io/text/bgzip_utils.cpp src/io/text/multibyte_split.cu + src/io/utilities/arrow_io_source.cpp src/io/utilities/column_buffer.cpp src/io/utilities/config_utils.cpp src/io/utilities/data_sink.cpp diff --git a/cpp/include/cudf/io/arrow_io_source.hpp b/cpp/include/cudf/io/arrow_io_source.hpp index c57b199f30e..5f79f05c5a1 100644 --- a/cpp/include/cudf/io/arrow_io_source.hpp +++ b/cpp/include/cudf/io/arrow_io_source.hpp @@ -18,10 +18,8 @@ #include "datasource.hpp" -#include #include -#include -#include +#include #include #include @@ -38,48 +36,13 @@ namespace cudf::io { * could be a memory-mapped file or other implementation supported by Arrow. */ class arrow_io_source : public datasource { - /** - * @brief Implementation for an owning buffer where `arrow::Buffer` holds the data. - */ - class arrow_io_buffer : public buffer { - std::shared_ptr arrow_buffer; - - public: - explicit arrow_io_buffer(std::shared_ptr arrow_buffer) - : arrow_buffer(arrow_buffer) - { - } - [[nodiscard]] size_t size() const override { return arrow_buffer->size(); } - [[nodiscard]] uint8_t const* data() const override { return arrow_buffer->data(); } - }; - public: /** * @brief Constructs an object from an Apache Arrow Filesystem URI * * @param arrow_uri Apache Arrow Filesystem URI */ - explicit arrow_io_source(std::string const& arrow_uri) - { - std::string const uri_start_delimiter = "//"; - std::string const uri_end_delimiter = "?"; - - auto const result = arrow::fs::FileSystemFromUri(arrow_uri); - CUDF_EXPECTS(result.ok(), "Failed to generate Arrow Filesystem instance from URI."); - filesystem = result.ValueOrDie(); - - // Parse the path from the URI - auto const start = [&]() { - auto const delim_start = arrow_uri.find(uri_start_delimiter); - return delim_start == std::string::npos ? 0 : delim_start + uri_start_delimiter.size(); - }(); - auto const end = arrow_uri.find(uri_end_delimiter) - start; - auto const path = arrow_uri.substr(start, end); - - auto const in_stream = filesystem->OpenInputFile(path); - CUDF_EXPECTS(in_stream.ok(), "Failed to open Arrow RandomAccessFile"); - arrow_file = in_stream.ValueOrDie(); - } + explicit arrow_io_source(std::string const& arrow_uri); /** * @brief Constructs an object from an `arrow` source object. @@ -95,12 +58,7 @@ class arrow_io_source : public datasource { * @param size The number of bytes to read * @return A buffer with the read data */ - std::unique_ptr host_read(size_t offset, size_t size) override - { - auto const result = arrow_file->ReadAt(offset, size); - CUDF_EXPECTS(result.ok(), "Cannot read file data"); - return std::make_unique(result.ValueOrDie()); - } + std::unique_ptr host_read(size_t offset, size_t size) override; /** * @brief Reads a selected range from the `arrow` source into a preallocated buffer. @@ -110,24 +68,13 @@ class arrow_io_source : public datasource { * @param[out] dst The preallocated buffer to read into * @return The number of bytes read */ - size_t host_read(size_t offset, size_t size, uint8_t* dst) override - { - auto const result = arrow_file->ReadAt(offset, size, dst); - CUDF_EXPECTS(result.ok(), "Cannot read file data"); - return result.ValueOrDie(); - } - + size_t host_read(size_t offset, size_t size, uint8_t* dst) override; /** * @brief Returns the size of the data in the `arrow` source. * * @return The size of the data in the `arrow` source */ - [[nodiscard]] size_t size() const override - { - auto const result = arrow_file->GetSize(); - CUDF_EXPECTS(result.ok(), "Cannot get file size"); - return result.ValueOrDie(); - } + [[nodiscard]] size_t size() const override; private: std::shared_ptr filesystem; diff --git a/cpp/src/io/utilities/arrow_io_source.cpp b/cpp/src/io/utilities/arrow_io_source.cpp new file mode 100644 index 00000000000..d647f3c0a4b --- /dev/null +++ b/cpp/src/io/utilities/arrow_io_source.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include + +#include +#include + +namespace cudf::io { + +/** + * @brief Implementation for an owning buffer where `arrow::Buffer` holds the data. + */ +class arrow_io_buffer : public datasource::buffer { + std::shared_ptr arrow_buffer; + + public: + explicit arrow_io_buffer(std::shared_ptr arrow_buffer) : arrow_buffer(arrow_buffer) + { + } + [[nodiscard]] size_t size() const override { return arrow_buffer->size(); } + [[nodiscard]] uint8_t const* data() const override { return arrow_buffer->data(); } +}; + +arrow_io_source::arrow_io_source(std::string const& arrow_uri) +{ + std::string const uri_start_delimiter = "//"; + std::string const uri_end_delimiter = "?"; + + auto const result = arrow::fs::FileSystemFromUri(arrow_uri); + CUDF_EXPECTS(result.ok(), "Failed to generate Arrow Filesystem instance from URI."); + filesystem = result.ValueOrDie(); + + // Parse the path from the URI + auto const start = [&]() { + auto const delim_start = arrow_uri.find(uri_start_delimiter); + return delim_start == std::string::npos ? 0 : delim_start + uri_start_delimiter.size(); + }(); + auto const end = arrow_uri.find(uri_end_delimiter) - start; + auto const path = arrow_uri.substr(start, end); + + auto const in_stream = filesystem->OpenInputFile(path); + CUDF_EXPECTS(in_stream.ok(), "Failed to open Arrow RandomAccessFile"); + arrow_file = in_stream.ValueOrDie(); +} + +std::unique_ptr arrow_io_source::host_read(size_t offset, size_t size) +{ + auto const result = arrow_file->ReadAt(offset, size); + CUDF_EXPECTS(result.ok(), "Cannot read file data"); + return std::make_unique(result.ValueOrDie()); +} + +size_t arrow_io_source::host_read(size_t offset, size_t size, uint8_t* dst) +{ + auto const result = arrow_file->ReadAt(offset, size, dst); + CUDF_EXPECTS(result.ok(), "Cannot read file data"); + return result.ValueOrDie(); +} + +[[nodiscard]] size_t arrow_io_source::size() const +{ + auto const result = arrow_file->GetSize(); + CUDF_EXPECTS(result.ok(), "Cannot get file size"); + return result.ValueOrDie(); +} + +} // namespace cudf::io diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index e282b153c31..6186d9d9736 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -26,6 +26,8 @@ #include #include +#include + #include #include #include