From 2de5267c565df0c6d76365c387e5e97d718192c3 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 5 Apr 2022 12:59:29 +0200 Subject: [PATCH 01/14] cmake: rapids_cpm_find() KvikIO --- cpp/CMakeLists.txt | 3 +++ cpp/cmake/thirdparty/get_kvikio.cmake | 38 +++++++++++++++++++++++++++ 2 files changed, 41 insertions(+) create mode 100644 cpp/cmake/thirdparty/get_kvikio.cmake diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9936db5b2fa..259dde373ef 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -155,6 +155,8 @@ include(cmake/thirdparty/get_gtest.cmake) include(cmake/Modules/JitifyPreprocessKernels.cmake) # find cuFile include(cmake/Modules/FindcuFile.cmake) +# find KvikIO +include(cmake/thirdparty/get_kvikio.cmake) # ################################################################################################## # * library targets ------------------------------------------------------------------------------- @@ -530,6 +532,7 @@ target_include_directories( cudf PUBLIC "$" "$" + "$" "$" "$" PRIVATE "$" diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake new file mode 100644 index 00000000000..c20c58b7889 --- /dev/null +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -0,0 +1,38 @@ +# ============================================================================= +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds KvikIO and sets `KvikIO_INCLUDE_DIR` +function(find_and_configure_kvikio VERSION) + + rapids_cpm_find( + KvikIO ${VERSION} + GLOBAL_TARGETS kvikio::kvikio + CPM_ARGS + GIT_REPOSITORY https://github.com/madsbk/kvikio.git + # GIT_REPOSITORY https://github.com/rapidsai/kvikio.git TODO: use this before merge of PR + GIT_TAG used_by_cudf_for_testing + # GIT_TAG branch-${VERSION} TODO: use this before merge of PR + GIT_SHALLOW TRUE SOURCE_SUBDIR cpp + OPTIONS "KvikIO_BUILD_EXAMPLES OFF" + ) + + set(KvikIO_INCLUDE_DIR + ${KvikIO_SOURCE_DIR}/cpp/include + PARENT_SCOPE + ) + +endfunction() + +set(KVIKIO_MIN_VERSION_cudf "${CUDF_VERSION_MAJOR}.${CUDF_VERSION_MINOR}") +find_and_configure_kvikio(${KVIKIO_MIN_VERSION_cudf}) From 24b98f4b56be9bf4c7d94f3de5286332e523fa88 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 5 Apr 2022 15:47:17 +0200 Subject: [PATCH 02/14] Adding KvikIO as a possible GDS backend --- cpp/src/io/utilities/config_utils.cpp | 5 ++- cpp/src/io/utilities/config_utils.hpp | 5 +++ cpp/src/io/utilities/datasource.cpp | 47 +++++++++++++++++++-------- 3 files changed, 43 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index a6bfb0d888f..942172f18a0 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -35,7 +35,7 @@ namespace { /** * @brief Defines which cuFile usage to enable. */ -enum class usage_policy : uint8_t { OFF, GDS, ALWAYS }; +enum class usage_policy : uint8_t { OFF, GDS, ALWAYS, KVIKIO }; /** * @brief Get the current usage policy. @@ -46,6 +46,7 @@ usage_policy get_env_policy() if (env_val == "OFF") return usage_policy::OFF; if (env_val == "GDS") return usage_policy::GDS; if (env_val == "ALWAYS") return usage_policy::ALWAYS; + if (env_val == "KVIKIO") return usage_policy::KVIKIO; CUDF_FAIL("Invalid LIBCUDF_CUFILE_POLICY value: " + env_val); } } // namespace @@ -54,6 +55,8 @@ bool is_always_enabled() { return get_env_policy() == usage_policy::ALWAYS; } bool is_gds_enabled() { return is_always_enabled() or get_env_policy() == usage_policy::GDS; } +bool is_kvikio_enabled() { return get_env_policy() == usage_policy::KVIKIO; } + } // namespace cufile_integration namespace nvcomp_integration { diff --git a/cpp/src/io/utilities/config_utils.hpp b/cpp/src/io/utilities/config_utils.hpp index 4b993043dd1..04647923ffe 100644 --- a/cpp/src/io/utilities/config_utils.hpp +++ b/cpp/src/io/utilities/config_utils.hpp @@ -48,6 +48,11 @@ bool is_always_enabled(); */ bool is_gds_enabled(); +/** + * @brief Returns true if KvikIO is enabled. + */ +bool is_kvikio_enabled(); + } // namespace cufile_integration namespace nvcomp_integration { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 6f864ab509f..aa9f3fcdab1 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -20,6 +20,9 @@ #include #include +#include +#include + #include #include #include @@ -33,28 +36,38 @@ namespace { */ class file_source : public datasource { public: - explicit file_source(const char* filepath) - : _file(filepath, O_RDONLY), _cufile_in(detail::make_cufile_input(filepath)) + explicit file_source(const char* filepath) : _file(filepath, O_RDONLY) { + if (detail::cufile_integration::is_kvikio_enabled()) { + _kvikio_file = kvikio::FileHandle(filepath); + _use_kvikio = true; + } else { + _cufile_in = std::unique_ptr(detail::make_cufile_input(filepath)); + } } virtual ~file_source() = default; - [[nodiscard]] bool supports_device_read() const override { return _cufile_in != nullptr; } + [[nodiscard]] bool supports_device_read() const override + { + return _use_kvikio || _cufile_in != nullptr; + } [[nodiscard]] bool is_device_read_preferred(size_t size) const override { - return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size); + return _use_kvikio || (_cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size)); } - std::unique_ptr device_read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) override + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); auto const read_size = std::min(size, _file.size() - offset); - return _cufile_in->read(offset, read_size, stream); + if (_use_kvikio) { return _kvikio_file.pread(dst, read_size, offset); } + return _cufile_in->read_async(offset, read_size, dst, stream); } size_t device_read(size_t offset, @@ -64,19 +77,25 @@ class file_source : public datasource { { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); + if (_use_kvikio) { return device_read_async(offset, size, dst, stream).get(); } auto const read_size = std::min(size, _file.size() - offset); return _cufile_in->read(offset, read_size, dst, stream); } - std::future device_read_async(size_t offset, - size_t size, - uint8_t* dst, - rmm::cuda_stream_view stream) override + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); + if (_use_kvikio) { + rmm::device_buffer out_data(size, stream); + size_t read = device_read(offset, size, reinterpret_cast(out_data.data()), stream); + out_data.resize(read, stream); + return datasource::buffer::create(std::move(out_data)); + } auto const read_size = std::min(size, _file.size() - offset); - return _cufile_in->read_async(offset, read_size, dst, stream); + return _cufile_in->read(offset, read_size, stream); } [[nodiscard]] size_t size() const override { return _file.size(); } @@ -86,6 +105,8 @@ class file_source : public datasource { private: std::unique_ptr _cufile_in; + kvikio::FileHandle _kvikio_file; + bool _use_kvikio{false}; }; /** From 405b1fc5f48b80ac43d8881a111d0ace31ac0047 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 5 Apr 2022 15:58:57 +0200 Subject: [PATCH 03/14] copyrights --- cpp/src/io/utilities/config_utils.cpp | 2 +- cpp/src/io/utilities/config_utils.hpp | 2 +- cpp/src/io/utilities/datasource.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index 942172f18a0..08b5914cb19 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/utilities/config_utils.hpp b/cpp/src/io/utilities/config_utils.hpp index 04647923ffe..4f6a14091cf 100644 --- a/cpp/src/io/utilities/config_utils.hpp +++ b/cpp/src/io/utilities/config_utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index aa9f3fcdab1..a65ff3a7475 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From ab8aeb0e51796aca8070c6016062b6aaf1f9b48a Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 5 Apr 2022 18:49:55 +0200 Subject: [PATCH 04/14] Adding KvikIO as backend for `file_sink` --- cpp/src/io/utilities/data_sink.cpp | 45 ++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 63d0103ddec..0abef498259 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,9 @@ #include "file_io_utilities.hpp" #include #include +#include +#include #include namespace cudf { @@ -30,10 +32,17 @@ namespace io { class file_sink : public data_sink { public: explicit file_sink(std::string const& filepath) - : _cufile_out(detail::make_cufile_output(filepath)) { _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); CUDF_EXPECTS(_output_stream.is_open(), "Cannot open output file"); + + if (detail::cufile_integration::is_kvikio_enabled()) { + _kvikio_file = kvikio::FileHandle(filepath, "w"); + _use_kvikio = true; + } else { + _cufile_out = + std::unique_ptr(detail::make_cufile_output(filepath)); + } } virtual ~file_sink() { flush(); } @@ -49,36 +58,50 @@ class file_sink : public data_sink { size_t bytes_written() override { return _bytes_written; } - [[nodiscard]] bool supports_device_write() const override { return _cufile_out != nullptr; } + [[nodiscard]] bool supports_device_write() const override + { + return _use_kvikio || _cufile_out != nullptr; + } [[nodiscard]] bool is_device_write_preferred(size_t size) const override { - return _cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size); + return _use_kvikio || (_cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size)); } - void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override + std::future device_write_async(void const* gpu_data, + size_t size, + rmm::cuda_stream_view stream) override { if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); - _cufile_out->write(gpu_data, _bytes_written, size); + size_t offset = _bytes_written; _bytes_written += size; + + if (_use_kvikio) { + // KvikIO's `pwrite()` returns a `std::future` so we convert it + // to `std::future` + return std::async(std::launch::deferred, [this, gpu_data, size, offset] { + _kvikio_file.pwrite(gpu_data, size, offset).get(); + }); + } + return _cufile_out->write_async(gpu_data, offset, size); } - std::future device_write_async(void const* gpu_data, - size_t size, - rmm::cuda_stream_view stream) override + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); - auto result = _cufile_out->write_async(gpu_data, _bytes_written, size); + if (_use_kvikio) { return device_write_async(gpu_data, _bytes_written, stream).get(); } + _cufile_out->write(gpu_data, _bytes_written, size); _bytes_written += size; - return result; } private: std::ofstream _output_stream; size_t _bytes_written = 0; std::unique_ptr _cufile_out; + kvikio::FileHandle _kvikio_file; + bool _use_kvikio{false}; }; /** From 7e79d2c8d543ad91800e82662bd680aab6c1f985 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 7 Apr 2022 21:58:45 +0200 Subject: [PATCH 05/14] use _kvikio_file.closed() instead of _use_kvikio --- cpp/src/io/utilities/data_sink.cpp | 13 +++++++------ cpp/src/io/utilities/datasource.cpp | 13 ++++++------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 0abef498259..6566ee52090 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -38,7 +38,6 @@ class file_sink : public data_sink { if (detail::cufile_integration::is_kvikio_enabled()) { _kvikio_file = kvikio::FileHandle(filepath, "w"); - _use_kvikio = true; } else { _cufile_out = std::unique_ptr(detail::make_cufile_output(filepath)); @@ -60,12 +59,13 @@ class file_sink : public data_sink { [[nodiscard]] bool supports_device_write() const override { - return _use_kvikio || _cufile_out != nullptr; + return !_kvikio_file.closed() || _cufile_out != nullptr; } [[nodiscard]] bool is_device_write_preferred(size_t size) const override { - return _use_kvikio || (_cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size)); + return !_kvikio_file.closed() || + (_cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size)); } std::future device_write_async(void const* gpu_data, @@ -77,7 +77,7 @@ class file_sink : public data_sink { size_t offset = _bytes_written; _bytes_written += size; - if (_use_kvikio) { + if (!_kvikio_file.closed()) { // KvikIO's `pwrite()` returns a `std::future` so we convert it // to `std::future` return std::async(std::launch::deferred, [this, gpu_data, size, offset] { @@ -91,7 +91,9 @@ class file_sink : public data_sink { { if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); - if (_use_kvikio) { return device_write_async(gpu_data, _bytes_written, stream).get(); } + if (!_kvikio_file.closed()) { + return device_write_async(gpu_data, _bytes_written, stream).get(); + } _cufile_out->write(gpu_data, _bytes_written, size); _bytes_written += size; } @@ -101,7 +103,6 @@ class file_sink : public data_sink { size_t _bytes_written = 0; std::unique_ptr _cufile_out; kvikio::FileHandle _kvikio_file; - bool _use_kvikio{false}; }; /** diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index a65ff3a7475..448ccbc1917 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -40,7 +40,6 @@ class file_source : public datasource { { if (detail::cufile_integration::is_kvikio_enabled()) { _kvikio_file = kvikio::FileHandle(filepath); - _use_kvikio = true; } else { _cufile_in = std::unique_ptr(detail::make_cufile_input(filepath)); } @@ -50,12 +49,13 @@ class file_source : public datasource { [[nodiscard]] bool supports_device_read() const override { - return _use_kvikio || _cufile_in != nullptr; + return !_kvikio_file.closed() || _cufile_in != nullptr; } [[nodiscard]] bool is_device_read_preferred(size_t size) const override { - return _use_kvikio || (_cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size)); + return !_kvikio_file.closed() || + (_cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size)); } std::future device_read_async(size_t offset, @@ -66,7 +66,7 @@ class file_source : public datasource { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); auto const read_size = std::min(size, _file.size() - offset); - if (_use_kvikio) { return _kvikio_file.pread(dst, read_size, offset); } + if (!_kvikio_file.closed()) { return _kvikio_file.pread(dst, read_size, offset); } return _cufile_in->read_async(offset, read_size, dst, stream); } @@ -77,7 +77,7 @@ class file_source : public datasource { { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - if (_use_kvikio) { return device_read_async(offset, size, dst, stream).get(); } + if (!_kvikio_file.closed()) { return device_read_async(offset, size, dst, stream).get(); } auto const read_size = std::min(size, _file.size() - offset); return _cufile_in->read(offset, read_size, dst, stream); } @@ -88,7 +88,7 @@ class file_source : public datasource { { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - if (_use_kvikio) { + if (!_kvikio_file.closed()) { rmm::device_buffer out_data(size, stream); size_t read = device_read(offset, size, reinterpret_cast(out_data.data()), stream); out_data.resize(read, stream); @@ -106,7 +106,6 @@ class file_source : public datasource { private: std::unique_ptr _cufile_in; kvikio::FileHandle _kvikio_file; - bool _use_kvikio{false}; }; /** From bcb0a8ac0a53d551af25a002313b09e91edff887 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 7 Apr 2022 21:01:31 +0200 Subject: [PATCH 06/14] Linking to kvikio::kvikio --- cpp/CMakeLists.txt | 3 +-- cpp/cmake/thirdparty/get_kvikio.cmake | 7 +------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 259dde373ef..b3582028e17 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -532,7 +532,6 @@ target_include_directories( cudf PUBLIC "$" "$" - "$" "$" "$" PRIVATE "$" @@ -572,7 +571,7 @@ add_dependencies(cudf jitify_preprocess_run) target_link_libraries( cudf PUBLIC ${ARROW_LIBRARIES} libcudacxx::libcudacxx cudf::Thrust rmm::rmm - PRIVATE cuco::cuco ZLIB::ZLIB nvcomp::nvcomp + PRIVATE cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio ) # Add Conda library, and include paths if specified diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index c20c58b7889..5b02759cecd 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -12,7 +12,7 @@ # the License. # ============================================================================= -# This function finds KvikIO and sets `KvikIO_INCLUDE_DIR` +# This function finds KvikIO function(find_and_configure_kvikio VERSION) rapids_cpm_find( @@ -27,11 +27,6 @@ function(find_and_configure_kvikio VERSION) OPTIONS "KvikIO_BUILD_EXAMPLES OFF" ) - set(KvikIO_INCLUDE_DIR - ${KvikIO_SOURCE_DIR}/cpp/include - PARENT_SCOPE - ) - endfunction() set(KVIKIO_MIN_VERSION_cudf "${CUDF_VERSION_MAJOR}.${CUDF_VERSION_MINOR}") From 6f0b18498a72d71bd6dfc876c75e6bcc30faf5a3 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 7 Apr 2022 22:41:45 +0200 Subject: [PATCH 07/14] use cufile_input_impl::read_async and cufile_output_impl::write_async --- cpp/src/io/utilities/data_sink.cpp | 7 +-- cpp/src/io/utilities/datasource.cpp | 17 ++---- cpp/src/io/utilities/file_io_utilities.cpp | 24 --------- cpp/src/io/utilities/file_io_utilities.hpp | 63 ---------------------- 4 files changed, 6 insertions(+), 105 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 6566ee52090..ce68cd8ed0d 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -90,12 +90,7 @@ class file_sink : public data_sink { void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); - - if (!_kvikio_file.closed()) { - return device_write_async(gpu_data, _bytes_written, stream).get(); - } - _cufile_out->write(gpu_data, _bytes_written, size); - _bytes_written += size; + return device_write_async(gpu_data, _bytes_written, stream).get(); } private: diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 448ccbc1917..50074970760 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -76,10 +76,7 @@ class file_source : public datasource { rmm::cuda_stream_view stream) override { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - - if (!_kvikio_file.closed()) { return device_read_async(offset, size, dst, stream).get(); } - auto const read_size = std::min(size, _file.size() - offset); - return _cufile_in->read(offset, read_size, dst, stream); + return device_read_async(offset, size, dst, stream).get(); } std::unique_ptr device_read(size_t offset, @@ -88,14 +85,10 @@ class file_source : public datasource { { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - if (!_kvikio_file.closed()) { - rmm::device_buffer out_data(size, stream); - size_t read = device_read(offset, size, reinterpret_cast(out_data.data()), stream); - out_data.resize(read, stream); - return datasource::buffer::create(std::move(out_data)); - } - auto const read_size = std::min(size, _file.size() - offset); - return _cufile_in->read(offset, read_size, stream); + rmm::device_buffer out_data(size, stream); + size_t read = device_read(offset, size, reinterpret_cast(out_data.data()), stream); + out_data.resize(read, stream); + return datasource::buffer::create(std::move(out_data)); } [[nodiscard]] size_t size() const override { return _file.size(); } diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp index f7e250f1d3f..c0dd85702e2 100644 --- a/cpp/src/io/utilities/file_io_utilities.cpp +++ b/cpp/src/io/utilities/file_io_utilities.cpp @@ -176,16 +176,6 @@ cufile_input_impl::cufile_input_impl(std::string const& filepath) pool.sleep_duration = 10; } -std::unique_ptr cufile_input_impl::read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) -{ - rmm::device_buffer out_data(size, stream); - auto read_size = read(offset, size, reinterpret_cast(out_data.data()), stream); - out_data.resize(read_size, stream); - return datasource::buffer::create(std::move(out_data)); -} - namespace { template cufile_input_impl::read_async(size_t offset, return std::async(std::launch::deferred, waiter, std::move(slice_tasks)); } -size_t cufile_input_impl::read(size_t offset, - size_t size, - uint8_t* dst, - rmm::cuda_stream_view stream) -{ - auto result = read_async(offset, size, dst, stream); - return result.get(); -} - cufile_output_impl::cufile_output_impl(std::string const& filepath) : shim{cufile_shim::instance()}, cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 0664), @@ -250,11 +231,6 @@ cufile_output_impl::cufile_output_impl(std::string const& filepath) { } -void cufile_output_impl::write(void const* data, size_t offset, size_t size) -{ - write_async(data, offset, size).wait(); -} - std::future cufile_output_impl::write_async(void const* data, size_t offset, size_t size) { int device; diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp index be3ecc49ab0..704ee77de8a 100644 --- a/cpp/src/io/utilities/file_io_utilities.hpp +++ b/cpp/src/io/utilities/file_io_utilities.hpp @@ -80,35 +80,6 @@ class cufile_io_base { */ class cufile_input : public cufile_io_base { public: - /** - * @brief Reads into a new device buffer. - * - * @throws cudf::logic_error on cuFile error - * - * @param offset Number of bytes from the start - * @param size Number of bytes to read - * @param stream CUDA stream to use - * - * @return The data buffer in the device memory - */ - virtual std::unique_ptr read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) = 0; - - /** - * @brief Reads into existing device memory. - * - * @throws cudf::logic_error on cuFile error - * - * @param offset Number of bytes from the start - * @param size Number of bytes to read - * @param dst Address of the existing device memory - * @param stream CUDA stream to use - * - * @return The number of bytes read - */ - virtual size_t read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) = 0; - /** * @brief Asynchronously reads into existing device memory. * @@ -132,17 +103,6 @@ class cufile_input : public cufile_io_base { */ class cufile_output : public cufile_io_base { public: - /** - * @brief Writes the data from a device buffer into a file. - * - * @throws cudf::logic_error on cuFile error - * - * @param data Pointer to the buffer to be written into the output file - * @param offset Number of bytes from the start - * @param size Number of bytes to write - */ - virtual void write(void const* data, size_t offset, size_t size) = 0; - /** * @brief Asynchronously writes the data from a device buffer into a file. * @@ -203,12 +163,6 @@ class cufile_input_impl final : public cufile_input { public: cufile_input_impl(std::string const& filepath); - std::unique_ptr read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) override; - - size_t read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) override; - std::future read_async(size_t offset, size_t size, uint8_t* dst, @@ -229,7 +183,6 @@ class cufile_output_impl final : public cufile_output { public: cufile_output_impl(std::string const& filepath); - void write(void const* data, size_t offset, size_t size) override; std::future write_async(void const* data, size_t offset, size_t size) override; private: @@ -241,18 +194,6 @@ class cufile_output_impl final : public cufile_output { class cufile_input_impl final : public cufile_input { public: - std::unique_ptr read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) override - { - CUDF_FAIL("Only used to compile without cufile library, should not be called"); - } - - size_t read(size_t offset, size_t size, uint8_t* dst, rmm::cuda_stream_view stream) override - { - CUDF_FAIL("Only used to compile without cufile library, should not be called"); - } - std::future read_async(size_t offset, size_t size, uint8_t* dst, @@ -264,10 +205,6 @@ class cufile_input_impl final : public cufile_input { class cufile_output_impl final : public cufile_output { public: - void write(void const* data, size_t offset, size_t size) override - { - CUDF_FAIL("Only used to compile without cufile library, should not be called"); - } std::future write_async(void const* data, size_t offset, size_t size) override { CUDF_FAIL("Only used to compile without cufile library, should not be called"); From 8d277fc68521660bf5f9560964339fb7b333eb0f Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 19 Apr 2022 09:12:49 +0200 Subject: [PATCH 08/14] Removed duplicated exceptions --- cpp/src/io/utilities/datasource.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 50074970760..7318f0d501e 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -75,7 +75,6 @@ class file_source : public datasource { uint8_t* dst, rmm::cuda_stream_view stream) override { - CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); return device_read_async(offset, size, dst, stream).get(); } @@ -83,8 +82,6 @@ class file_source : public datasource { size_t size, rmm::cuda_stream_view stream) override { - CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - rmm::device_buffer out_data(size, stream); size_t read = device_read(offset, size, reinterpret_cast(out_data.data()), stream); out_data.resize(read, stream); From 84986d15e63f78a2bc6a3e3700a0b16b45b6832a Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 19 Apr 2022 09:19:28 +0200 Subject: [PATCH 09/14] get_kvikio: use https://github.com/rapidsai/kvikio.git --- cpp/cmake/thirdparty/get_kvikio.cmake | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index 5b02759cecd..800ab2d5c6f 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -19,10 +19,8 @@ function(find_and_configure_kvikio VERSION) KvikIO ${VERSION} GLOBAL_TARGETS kvikio::kvikio CPM_ARGS - GIT_REPOSITORY https://github.com/madsbk/kvikio.git - # GIT_REPOSITORY https://github.com/rapidsai/kvikio.git TODO: use this before merge of PR - GIT_TAG used_by_cudf_for_testing - # GIT_TAG branch-${VERSION} TODO: use this before merge of PR + GIT_REPOSITORY https://github.com/rapidsai/kvikio.git + GIT_TAG branch-${VERSION} GIT_SHALLOW TRUE SOURCE_SUBDIR cpp OPTIONS "KvikIO_BUILD_EXAMPLES OFF" ) From a8cab3cfc2f32951af159bdbcff393d587f26392 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 19 Apr 2022 09:43:58 +0200 Subject: [PATCH 10/14] Clean up Co-authored-by: Vukasin Milovanovic --- cpp/src/io/utilities/data_sink.cpp | 2 +- cpp/src/io/utilities/datasource.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index ce68cd8ed0d..b3016a60a10 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -40,7 +40,7 @@ class file_sink : public data_sink { _kvikio_file = kvikio::FileHandle(filepath, "w"); } else { _cufile_out = - std::unique_ptr(detail::make_cufile_output(filepath)); + detail::make_cufile_output(filepath); } } diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 7318f0d501e..80e07f31dd9 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -41,7 +41,7 @@ class file_source : public datasource { if (detail::cufile_integration::is_kvikio_enabled()) { _kvikio_file = kvikio::FileHandle(filepath); } else { - _cufile_in = std::unique_ptr(detail::make_cufile_input(filepath)); + _cufile_in = detail::make_cufile_input(filepath); } } From 79965f499eb5e3c1b770099f2314234e665629fe Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 19 Apr 2022 09:52:25 +0200 Subject: [PATCH 11/14] style: clang format --- cpp/src/io/utilities/data_sink.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index b3016a60a10..042afc01253 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -39,8 +39,7 @@ class file_sink : public data_sink { if (detail::cufile_integration::is_kvikio_enabled()) { _kvikio_file = kvikio::FileHandle(filepath, "w"); } else { - _cufile_out = - detail::make_cufile_output(filepath); + _cufile_out = detail::make_cufile_output(filepath); } } From fad1055e92806bf6817ba4859042ef95f80bce5d Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 20 Apr 2022 10:44:14 +0200 Subject: [PATCH 12/14] doc --- docs/cudf/source/basics/io-gds-integration.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/cudf/source/basics/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst index 5ff07ac29c5..0906c18b21b 100644 --- a/docs/cudf/source/basics/io-gds-integration.rst +++ b/docs/cudf/source/basics/io-gds-integration.rst @@ -10,10 +10,11 @@ GDS is also included in CUDA Toolkit 11.4 and higher. Use of GPUDirect Storage in cuDF is enabled by default, but can be disabled through the environment variable ``LIBCUDF_CUFILE_POLICY``. This variable also controls the GDS compatibility mode. -There are three valid values for the environment variable: +There are four valid values for the environment variable: - "GDS": Enable GDS use; GDS compatibility mode is *off*. - "ALWAYS": Enable GDS use; GDS compatibility mode is *on*. +- "KVIKIO": Enable GDS through `KvikIO `_. - "OFF": Completely disable GDS use. If no value is set, behavior will be the same as the "GDS" option. @@ -21,7 +22,9 @@ If no value is set, behavior will be the same as the "GDS" option. This environment variable also affects how cuDF treats GDS errors. When ``LIBCUDF_CUFILE_POLICY`` is set to "GDS" and a GDS API call fails for any reason, cuDF falls back to the internal implementation with bounce buffers. When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on), -cuDF throws an exception to propagate the error to te user. +cuDF throws an exception to propagate the error to the user. +When ``LIBCUDF_CUFILE_POLICY`` is set to "KVIKIO" and a GDS API call fails for any reason cuDF throws an exception to propagate the error to the user. +For more information about error handling, compatibility mode, and tuning parameters in KvikIO see: https://github.com/rapidsai/kvikio Operations that support the use of GPUDirect Storage: @@ -36,4 +39,4 @@ Several parameters that can be used to tune the performance of GDS-enabled I/O a - ``LIBCUDF_CUFILE_THREAD_COUNT``: Integral value, maximum number of parallel reads/writes per file (default 16); - ``LIBCUDF_CUFILE_SLICE_SIZE``: Integral value, maximum size of each GDS read/write, in bytes (default 4MB). - Larger I/O operations are split into multiple calls. \ No newline at end of file + Larger I/O operations are split into multiple calls. From a4aa99f8c02cc3563d5ac0739d7e39d92ba50d75 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 20 Apr 2022 20:49:54 +0200 Subject: [PATCH 13/14] more doc --- docs/cudf/source/basics/io-gds-integration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cudf/source/basics/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst index 0906c18b21b..8e1ba1eafcc 100644 --- a/docs/cudf/source/basics/io-gds-integration.rst +++ b/docs/cudf/source/basics/io-gds-integration.rst @@ -23,7 +23,7 @@ This environment variable also affects how cuDF treats GDS errors. When ``LIBCUDF_CUFILE_POLICY`` is set to "GDS" and a GDS API call fails for any reason, cuDF falls back to the internal implementation with bounce buffers. When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on), cuDF throws an exception to propagate the error to the user. -When ``LIBCUDF_CUFILE_POLICY`` is set to "KVIKIO" and a GDS API call fails for any reason cuDF throws an exception to propagate the error to the user. +When ``LIBCUDF_CUFILE_POLICY`` is set to "KVIKIO" and a KvikIO API call fails for any reason (unlikely, given that the KvikIO implements its own compatibility mode) cuDF throws an exception to propagate the error to the user. For more information about error handling, compatibility mode, and tuning parameters in KvikIO see: https://github.com/rapidsai/kvikio Operations that support the use of GPUDirect Storage: From 1a6341e224444028ab7389a30129dc5c0751ee1c Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 20 Apr 2022 20:52:29 +0200 Subject: [PATCH 14/14] typo --- docs/cudf/source/basics/io-gds-integration.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/cudf/source/basics/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst index 8e1ba1eafcc..ce774453386 100644 --- a/docs/cudf/source/basics/io-gds-integration.rst +++ b/docs/cudf/source/basics/io-gds-integration.rst @@ -23,7 +23,7 @@ This environment variable also affects how cuDF treats GDS errors. When ``LIBCUDF_CUFILE_POLICY`` is set to "GDS" and a GDS API call fails for any reason, cuDF falls back to the internal implementation with bounce buffers. When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on), cuDF throws an exception to propagate the error to the user. -When ``LIBCUDF_CUFILE_POLICY`` is set to "KVIKIO" and a KvikIO API call fails for any reason (unlikely, given that the KvikIO implements its own compatibility mode) cuDF throws an exception to propagate the error to the user. +When ``LIBCUDF_CUFILE_POLICY`` is set to "KVIKIO" and a KvikIO API call fails for any reason (unlikely, given that KvikIO implements its own compatibility mode) cuDF throws an exception to propagate the error to the user. For more information about error handling, compatibility mode, and tuning parameters in KvikIO see: https://github.com/rapidsai/kvikio Operations that support the use of GPUDirect Storage: