From a59edcf0e6b66c4d38457c813638b8f080b7823e Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 21 Mar 2022 14:34:48 +0100 Subject: [PATCH 01/10] cmake: get KvikIO --- cpp/CMakeLists.txt | 3 +++ cpp/cmake/thirdparty/get_kvikio.cmake | 34 +++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 cpp/cmake/thirdparty/get_kvikio.cmake diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9750ed7c5c4..7dd8b3c83ab 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -155,6 +155,8 @@ include(cmake/thirdparty/get_gtest.cmake) include(cmake/Modules/JitifyPreprocessKernels.cmake) # find cuFile include(cmake/Modules/FindcuFile.cmake) +# find KvikIO +include(cmake/thirdparty/get_kvikio.cmake) # ################################################################################################## # * library targets ------------------------------------------------------------------------------- @@ -529,6 +531,7 @@ target_include_directories( cudf PUBLIC "$" "$" + "$" "$" "$" PRIVATE "$" diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake new file mode 100644 index 00000000000..c6ee469d960 --- /dev/null +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -0,0 +1,34 @@ +# ============================================================================= +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# This function finds KvikIO and sets `KvikIO_INCLUDE_DIR` +function(find_and_configure_kvikio) + + rapids_cpm_find( + KvikIO 22.04 + GLOBAL_TARGETS kvikio::kvikio + # CPM_ARGS GIT_REPOSITORY https://github.com/rapidsai/kvikio.git + CPM_ARGS + GIT_REPOSITORY https://github.com/madsbk/kvikio.git SOURCE_SUBDIR cpp + GIT_TAG file_size # TODO: use version tags when they become available + OPTIONS "KvikIO_BUILD_EXAMPLES FALSE" # No need to build the KvikIO example + ) + set(KvikIO_INCLUDE_DIR + ${KvikIO_SOURCE_DIR}/cpp/include + PARENT_SCOPE + ) + +endfunction() + +find_and_configure_kvikio() From a8782a67f0af5e36eb1eac73e39aad0d83204236 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 21 Mar 2022 14:35:34 +0100 Subject: [PATCH 02/10] datasource: use KvikIO --- cpp/src/io/utilities/datasource.cpp | 87 ++++++++++++----------------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 6f864ab509f..3e5f2666b29 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,12 +14,13 @@ * limitations under the License. */ -#include "file_io_utilities.hpp" - #include #include #include +#include +#include + #include #include #include @@ -33,28 +34,23 @@ namespace { */ class file_source : public datasource { public: - explicit file_source(const char* filepath) - : _file(filepath, O_RDONLY), _cufile_in(detail::make_cufile_input(filepath)) - { - } + explicit file_source(const char* filepath) : _file(filepath) {} - virtual ~file_source() = default; + ~file_source() override = default; - [[nodiscard]] bool supports_device_read() const override { return _cufile_in != nullptr; } + [[nodiscard]] bool supports_device_read() const override { return true; } [[nodiscard]] bool is_device_read_preferred(size_t size) const override { - return _cufile_in != nullptr && _cufile_in->is_cufile_io_preferred(size); + return size > (128 << 10); } - std::unique_ptr device_read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) override + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override { - CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - - auto const read_size = std::min(size, _file.size() - offset); - return _cufile_in->read(offset, read_size, stream); + return _file.pread(dst, std::min(size, _file.nbytes() - offset), offset); } size_t device_read(size_t offset, @@ -62,30 +58,24 @@ class file_source : public datasource { uint8_t* dst, rmm::cuda_stream_view stream) override { - CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - - auto const read_size = std::min(size, _file.size() - offset); - return _cufile_in->read(offset, read_size, dst, stream); + return device_read_async(offset, size, dst, stream).get(); } - std::future device_read_async(size_t offset, - size_t size, - uint8_t* dst, - rmm::cuda_stream_view stream) override + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override { - CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - - auto const read_size = std::min(size, _file.size() - offset); - return _cufile_in->read_async(offset, read_size, dst, stream); + rmm::device_buffer out_data(size, stream); + size_t read_size = + device_read(offset, size, reinterpret_cast(out_data.data()), stream); + out_data.resize(read_size, stream); + return datasource::buffer::create(std::move(out_data)); } - [[nodiscard]] size_t size() const override { return _file.size(); } + [[nodiscard]] size_t size() const override { return _file.nbytes(); } protected: - detail::file_wrapper _file; - - private: - std::unique_ptr _cufile_in; + kvikio::FileHandle _file; }; /** @@ -99,7 +89,7 @@ class memory_mapped_source : public file_source { explicit memory_mapped_source(const char* filepath, size_t offset, size_t size) : file_source(filepath) { - if (_file.size() != 0) map(_file.desc(), offset, size); + if (_file.nbytes() != 0) map(_file.fd(), offset, size); } ~memory_mapped_source() override @@ -133,12 +123,12 @@ class memory_mapped_source : public file_source { private: void map(int fd, size_t offset, size_t size) { - CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file"); + CUDF_EXPECTS(offset < _file.nbytes(), "Offset is past end of file"); // Offset for `mmap()` must be page aligned _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1); - if (size == 0 || (offset + size) > _file.size()) { size = _file.size() - offset; } + if (size == 0 || (offset + size) > _file.nbytes()) { size = _file.nbytes() - offset; } // Size for `mmap()` needs to include the page padding _map_size = size + (offset - _map_offset); @@ -166,24 +156,24 @@ class direct_read_source : public file_source { std::unique_ptr host_read(size_t offset, size_t size) override { - lseek(_file.desc(), offset, SEEK_SET); + lseek(_file.fd(), offset, SEEK_SET); // Clamp length to available data - ssize_t const read_size = std::min(size, _file.size() - offset); + ssize_t const read_size = std::min(size, _file.nbytes() - offset); std::vector v(read_size); - CUDF_EXPECTS(read(_file.desc(), v.data(), read_size) == read_size, "read failed"); + CUDF_EXPECTS(read(_file.fd(), v.data(), read_size) == read_size, "read failed"); return buffer::create(std::move(v)); } size_t host_read(size_t offset, size_t size, uint8_t* dst) override { - lseek(_file.desc(), offset, SEEK_SET); + lseek(_file.fd(), offset, SEEK_SET); // Clamp length to available data - auto const read_size = std::min(size, _file.size() - offset); + auto const read_size = std::min(size, _file.nbytes() - offset); - CUDF_EXPECTS(read(_file.desc(), dst, read_size) == static_cast(read_size), + CUDF_EXPECTS(read(_file.fd(), dst, read_size) == static_cast(read_size), "read failed"); return read_size; } @@ -242,14 +232,11 @@ std::unique_ptr datasource::create(const std::string& filepath, size_t offset, size_t size) { -#ifdef CUFILE_FOUND - if (detail::cufile_integration::is_always_enabled()) { - // avoid mmap as GDS is expected to be used for most reads - return std::make_unique(filepath.c_str()); - } -#endif + // TODO: make configurable + // avoid mmap as GDS is expected to be used for most reads + return std::make_unique(filepath.c_str()); // Use our own memory mapping implementation for direct file reads - return std::make_unique(filepath.c_str(), offset, size); + // return std::make_unique(filepath.c_str(), offset, size); } std::unique_ptr datasource::create(host_buffer const& buffer) From 463770a58928287a2c31cb66b91af33f80811117 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 22 Mar 2022 08:12:11 +0100 Subject: [PATCH 03/10] cmake: use `GIT_TAG no_cufile` --- cpp/cmake/thirdparty/get_kvikio.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index c6ee469d960..93eaa0f0071 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -21,7 +21,7 @@ function(find_and_configure_kvikio) # CPM_ARGS GIT_REPOSITORY https://github.com/rapidsai/kvikio.git CPM_ARGS GIT_REPOSITORY https://github.com/madsbk/kvikio.git SOURCE_SUBDIR cpp - GIT_TAG file_size # TODO: use version tags when they become available + GIT_TAG no_cufile # TODO: use version tags when they become available OPTIONS "KvikIO_BUILD_EXAMPLES FALSE" # No need to build the KvikIO example ) set(KvikIO_INCLUDE_DIR From 061c8752bbba03c9d94ff0f02d734ea86cf5f673 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 23 Mar 2022 09:03:58 +0100 Subject: [PATCH 04/10] Adding "Offset is past end of file" checks --- cpp/src/io/utilities/datasource.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 3e5f2666b29..6743ada8790 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -156,6 +156,7 @@ class direct_read_source : public file_source { std::unique_ptr host_read(size_t offset, size_t size) override { + CUDF_EXPECTS(offset < _file.nbytes(), "Offset is past end of file"); lseek(_file.fd(), offset, SEEK_SET); // Clamp length to available data @@ -168,6 +169,7 @@ class direct_read_source : public file_source { size_t host_read(size_t offset, size_t size, uint8_t* dst) override { + CUDF_EXPECTS(offset < _file.nbytes(), "Offset is past end of file"); lseek(_file.fd(), offset, SEEK_SET); // Clamp length to available data From 5cc9294e306b03c80d5e5f34f2ede9f0a7d1197a Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 23 Mar 2022 11:24:14 +0100 Subject: [PATCH 05/10] data_sink: use KvikIO --- cpp/src/io/utilities/data_sink.cpp | 36 ++++++++++++++---------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 63d0103ddec..3bcbb44d76f 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,10 +16,10 @@ #include -#include "file_io_utilities.hpp" #include #include +#include #include namespace cudf { @@ -29,8 +29,7 @@ namespace io { */ class file_sink : public data_sink { public: - explicit file_sink(std::string const& filepath) - : _cufile_out(detail::make_cufile_output(filepath)) + explicit file_sink(std::string const& filepath) : _file(filepath, "w") { _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc); CUDF_EXPECTS(_output_stream.is_open(), "Cannot open output file"); @@ -49,36 +48,35 @@ class file_sink : public data_sink { size_t bytes_written() override { return _bytes_written; } - [[nodiscard]] bool supports_device_write() const override { return _cufile_out != nullptr; } + [[nodiscard]] bool supports_device_write() const override { return true; } [[nodiscard]] bool is_device_write_preferred(size_t size) const override { - return _cufile_out != nullptr && _cufile_out->is_cufile_io_preferred(size); - } - - void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override - { - if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); - - _cufile_out->write(gpu_data, _bytes_written, size); - _bytes_written += size; + return size > (128 << 10); } std::future device_write_async(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override { - if (!supports_device_write()) CUDF_FAIL("Device writes are not supported for this file."); - - auto result = _cufile_out->write_async(gpu_data, _bytes_written, size); + // KvikIO's `pwrite()` returns a `std::future` so we convert it + // to `std::future` + size_t offset = _bytes_written; _bytes_written += size; - return result; + return std::async(std::launch::deferred, [this, gpu_data, size, offset] { + _file.pwrite(gpu_data, size, offset).get(); + }); + } + + void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override + { + return device_write_async(gpu_data, size, stream).get(); } private: std::ofstream _output_stream; size_t _bytes_written = 0; - std::unique_ptr _cufile_out; + kvikio::FileHandle _file; }; /** From 9eda10d6468a2cc955478da0878c5bf656185dcb Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 23 Mar 2022 15:11:12 +0100 Subject: [PATCH 06/10] Use cufile_integration::is_gds_enabled() --- cpp/src/io/utilities/data_sink.cpp | 7 ++++++- cpp/src/io/utilities/datasource.cpp | 20 +++++++++++++------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 3bcbb44d76f..632aad96d22 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -16,6 +16,8 @@ #include +#include + #include #include @@ -48,7 +50,10 @@ class file_sink : public data_sink { size_t bytes_written() override { return _bytes_written; } - [[nodiscard]] bool supports_device_write() const override { return true; } + [[nodiscard]] bool supports_device_write() const override + { + return detail::cufile_integration::is_gds_enabled(); + } [[nodiscard]] bool is_device_write_preferred(size_t size) const override { diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 6743ada8790..0de07bfb63f 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -14,9 +14,10 @@ * limitations under the License. */ +#include + #include #include -#include #include #include @@ -36,9 +37,12 @@ class file_source : public datasource { public: explicit file_source(const char* filepath) : _file(filepath) {} - ~file_source() override = default; + virtual ~file_source() = default; - [[nodiscard]] bool supports_device_read() const override { return true; } + [[nodiscard]] bool supports_device_read() const override + { + return detail::cufile_integration::is_gds_enabled(); + } [[nodiscard]] bool is_device_read_preferred(size_t size) const override { @@ -234,11 +238,13 @@ std::unique_ptr datasource::create(const std::string& filepath, size_t offset, size_t size) { - // TODO: make configurable - // avoid mmap as GDS is expected to be used for most reads - return std::make_unique(filepath.c_str()); + if (detail::cufile_integration::is_gds_enabled()) { + // avoid mmap as GDS is expected to be used for most reads + return std::make_unique(filepath.c_str()); + } + // Use our own memory mapping implementation for direct file reads - // return std::make_unique(filepath.c_str(), offset, size); + return std::make_unique(filepath.c_str(), offset, size); } std::unique_ptr datasource::create(host_buffer const& buffer) From 4e5ec48249cee135a5f4238d6bb593345429c630 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 24 Mar 2022 15:18:41 +0100 Subject: [PATCH 07/10] Remove overhead of get_env_policy() --- cpp/src/io/utilities/config_utils.cpp | 14 ++++++++++---- cpp/src/io/utilities/data_sink.cpp | 5 ----- cpp/src/io/utilities/datasource.cpp | 5 ----- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index a6bfb0d888f..2d20db342d1 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -37,10 +37,7 @@ namespace { */ enum class usage_policy : uint8_t { OFF, GDS, ALWAYS }; -/** - * @brief Get the current usage policy. - */ -usage_policy get_env_policy() +usage_policy _get_env_policy() { static auto const env_val = getenv_or("LIBCUDF_CUFILE_POLICY", "GDS"); if (env_val == "OFF") return usage_policy::OFF; @@ -48,6 +45,15 @@ usage_policy get_env_policy() if (env_val == "ALWAYS") return usage_policy::ALWAYS; CUDF_FAIL("Invalid LIBCUDF_CUFILE_POLICY value: " + env_val); } + +/** + * @brief Get the current usage policy. + */ +usage_policy get_env_policy() +{ + static auto const ret = _get_env_policy(); + return ret; +} } // namespace bool is_always_enabled() { return get_env_policy() == usage_policy::ALWAYS; } diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index 632aad96d22..718030d093e 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -55,11 +55,6 @@ class file_sink : public data_sink { return detail::cufile_integration::is_gds_enabled(); } - [[nodiscard]] bool is_device_write_preferred(size_t size) const override - { - return size > (128 << 10); - } - std::future device_write_async(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 0de07bfb63f..4759e48ffea 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -44,11 +44,6 @@ class file_source : public datasource { return detail::cufile_integration::is_gds_enabled(); } - [[nodiscard]] bool is_device_read_preferred(size_t size) const override - { - return size > (128 << 10); - } - std::future device_read_async(size_t offset, size_t size, uint8_t* dst, From a0241d6b9687555fdd54212562b8f2e9558e9cd7 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 24 Mar 2022 15:19:27 +0100 Subject: [PATCH 08/10] Use memory_mapped_source by default Do we want an option to enable the use of `direct_read_source`? --- cpp/src/io/utilities/datasource.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 4759e48ffea..6c44330d55b 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -233,12 +233,12 @@ std::unique_ptr datasource::create(const std::string& filepath, size_t offset, size_t size) { - if (detail::cufile_integration::is_gds_enabled()) { - // avoid mmap as GDS is expected to be used for most reads - return std::make_unique(filepath.c_str()); - } + // TODO: do we want an option to enable the use of `direct_read_source` instead + // of `memory_mapped_source`? + // return std::make_unique(filepath.c_str()); - // Use our own memory mapping implementation for direct file reads + // Notice, some readers, such as avro, will call `host_read()` on more data + // than what is actually accessed thus the lazy nature of mmap is essential. return std::make_unique(filepath.c_str(), offset, size); } From 4ca2a7454bf1a3c71dffedd51322d25f725098c8 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 24 Mar 2022 16:34:00 +0100 Subject: [PATCH 09/10] config_utils.cpp: fixed copyright year --- cpp/src/io/utilities/config_utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index 2d20db342d1..45d2155da09 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 3356645084fee229658408b9cd997ad2ffeac892 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 28 Mar 2022 11:20:03 +0200 Subject: [PATCH 10/10] cmake: pull KvikIO from my private repos for testing TODO: before merging, this should be changed to KvikIO main repos --- cpp/cmake/thirdparty/get_kvikio.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index 93eaa0f0071..0c5ca62f41c 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -19,9 +19,9 @@ function(find_and_configure_kvikio) KvikIO 22.04 GLOBAL_TARGETS kvikio::kvikio # CPM_ARGS GIT_REPOSITORY https://github.com/rapidsai/kvikio.git - CPM_ARGS + CPM_ARGS # TODO: use version tags when they become available GIT_REPOSITORY https://github.com/madsbk/kvikio.git SOURCE_SUBDIR cpp - GIT_TAG no_cufile # TODO: use version tags when they become available + GIT_TAG used_by_cudf_for_testing OPTIONS "KvikIO_BUILD_EXAMPLES FALSE" # No need to build the KvikIO example ) set(KvikIO_INCLUDE_DIR