-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Remove Arrow dependency from the datasource.hpp
public header
#13698
Changes from 12 commits
c27b915
f44947e
0bd1d01
650811f
b3a308a
ad1390e
4cdd6dd
bb37452
015f1ac
1a66e27
031ca64
077bfd2
cdb18da
2445bf3
8df8011
722081f
694087e
0d4c8b0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
/* | ||
* Copyright (c) 2023, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include "datasource.hpp" | ||
|
||
#include <arrow/buffer.h> | ||
#include <arrow/filesystem/filesystem.h> | ||
#include <arrow/io/file.h> | ||
#include <arrow/io/memory.h> | ||
|
||
#include <memory> | ||
#include <string> | ||
|
||
namespace cudf::io { | ||
/** | ||
* @addtogroup io_datasources | ||
* @{ | ||
* @file | ||
*/ | ||
|
||
/** | ||
* @brief Implementation class for reading from an Apache Arrow file. The file | ||
* could be a memory-mapped file or other implementation supported by Arrow. | ||
*/ | ||
class arrow_io_source : public datasource { | ||
/** | ||
* @brief Implementation for an owning buffer where `arrow::Buffer` holds the data. | ||
*/ | ||
class arrow_io_buffer : public buffer { | ||
std::shared_ptr<arrow::Buffer> arrow_buffer; | ||
|
||
public: | ||
explicit arrow_io_buffer(std::shared_ptr<arrow::Buffer> arrow_buffer) | ||
: arrow_buffer(arrow_buffer) | ||
{ | ||
} | ||
[[nodiscard]] size_t size() const override { return arrow_buffer->size(); } | ||
[[nodiscard]] uint8_t const* data() const override { return arrow_buffer->data(); } | ||
}; | ||
|
||
public: | ||
/** | ||
* @brief Constructs an object from an Apache Arrow Filesystem URI | ||
* | ||
* @param arrow_uri Apache Arrow Filesystem URI | ||
*/ | ||
explicit arrow_io_source(std::string const& arrow_uri) | ||
{ | ||
std::string const uri_start_delimiter = "//"; | ||
std::string const uri_end_delimiter = "?"; | ||
|
||
auto const result = arrow::fs::FileSystemFromUri(arrow_uri); | ||
CUDF_EXPECTS(result.ok(), "Failed to generate Arrow Filesystem instance from URI."); | ||
filesystem = result.ValueOrDie(); | ||
|
||
// Parse the path from the URI | ||
size_t start = arrow_uri.find(uri_start_delimiter) == std::string::npos | ||
? 0 | ||
: arrow_uri.find(uri_start_delimiter) + uri_start_delimiter.size(); | ||
size_t end = arrow_uri.find(uri_end_delimiter) - start; | ||
std::string_view path = arrow_uri.substr(start, end); | ||
|
||
auto const in_stream = filesystem->OpenInputFile(static_cast<std::string>(path).c_str()); | ||
CUDF_EXPECTS(in_stream.ok(), "Failed to open Arrow RandomAccessFile"); | ||
arrow_file = in_stream.ValueOrDie(); | ||
} | ||
|
||
/** | ||
* @brief Constructs an object from an `arrow` source object. | ||
* | ||
* @param file The `arrow` object from which the data is read | ||
*/ | ||
explicit arrow_io_source(std::shared_ptr<arrow::io::RandomAccessFile> file) : arrow_file(file) {} | ||
|
||
/** | ||
* @brief Returns a buffer with a subset of data from the `arrow` source. | ||
* | ||
* @param offset The offset in bytes from which to read | ||
* @param size The number of bytes to read | ||
* @return A buffer with the read data | ||
*/ | ||
std::unique_ptr<buffer> host_read(size_t offset, size_t size) override | ||
{ | ||
auto const result = arrow_file->ReadAt(offset, size); | ||
CUDF_EXPECTS(result.ok(), "Cannot read file data"); | ||
return std::make_unique<arrow_io_buffer>(result.ValueOrDie()); | ||
} | ||
|
||
/** | ||
* @brief Reads a selected range from the `arrow` source into a preallocated buffer. | ||
* | ||
* @param[in] offset The offset in bytes from which to read | ||
* @param[in] size The number of bytes to read | ||
* @param[out] dst The preallocated buffer to read into | ||
* @return The number of bytes read | ||
*/ | ||
size_t host_read(size_t offset, size_t size, uint8_t* dst) override | ||
{ | ||
auto const result = arrow_file->ReadAt(offset, size, dst); | ||
CUDF_EXPECTS(result.ok(), "Cannot read file data"); | ||
return result.ValueOrDie(); | ||
} | ||
|
||
/** | ||
* @brief Returns the size of the data in the `arrow` source. | ||
* | ||
* @return The size of the data in the `arrow` source | ||
*/ | ||
[[nodiscard]] size_t size() const override | ||
{ | ||
auto const result = arrow_file->GetSize(); | ||
CUDF_EXPECTS(result.ok(), "Cannot get file size"); | ||
return result.ValueOrDie(); | ||
} | ||
|
||
private: | ||
std::shared_ptr<arrow::fs::FileSystem> filesystem; | ||
std::shared_ptr<arrow::io::RandomAccessFile> arrow_file; | ||
}; | ||
|
||
/** @} */ // end of group | ||
} // namespace cudf::io |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Copyright (c) 2023, NVIDIA CORPORATION. | ||
|
||
from libcpp.memory cimport shared_ptr | ||
from libcpp.string cimport string | ||
from pyarrow.includes.libarrow cimport CRandomAccessFile | ||
|
||
cimport cudf._lib.cpp.io.datasource as cudf_io_datasource | ||
|
||
|
||
cdef extern from "cudf/io/arrow_io_source.hpp" \ | ||
namespace "cudf::io" nogil: | ||
|
||
cdef cppclass arrow_io_source(cudf_io_datasource.datasource): | ||
arrow_io_source(const string& arrow_uri) except + | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. east const. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For some reason I only see "const string" in pxd files so I kept this consistent with other instances. I'm not sure if our east const guideline reached Cython. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cython doesn't support east const. |
||
arrow_io_source(shared_ptr[CRandomAccessFile]) except + |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same to all other member functions. Maybe considering separating class declaration from definition
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I want to keep this class separate from libcudf, but we don't have to take that route.
@davidwendt thoughts on keeping the implementation in libcudf?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure I understand what you mean by keeping this separate from libcudf.
I think Yunsong means moves the implementation to
cpp/src/io/arrow_io_source.cpp
and keep only the declarations in this header file. I agree with that approach.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I meant that I will need to compile cpp/src/io/arrow_io_source.cpp as part of libcudf. Not an issue, just different from what I imagined in the beginning.