From d338d500c5c64a7d26e005478f28a90df3a8d3cd Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Sat, 9 Nov 2024 16:42:24 +0100 Subject: [PATCH 1/6] CUDF_KVIKIO_REMOTE_IO --- cpp/CMakeLists.txt | 12 ++++++++++++ cpp/cmake/thirdparty/get_kvikio.cmake | 2 +- cpp/src/io/utilities/datasource.cpp | 12 +++++++++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 559826ac232..59fcb6f68ed 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -90,6 +90,12 @@ option( mark_as_advanced(CUDF_BUILD_STREAMS_TEST_UTIL) option(CUDF_STATIC_LINTERS "Enable static linters during compilation" OFF) +option( + CUDF_KVIKIO_REMOTE_IO + "Enable remote IO (e.g. AWS S3) support through KvikIO. If disabled, cudf-python will still be able to do remote IO." + ON +) + message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") message(VERBOSE "CUDF: Configure CMake to build (google & nvbench) benchmarks: ${BUILD_BENCHMARKS}") @@ -109,6 +115,9 @@ message( "CUDF: Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler): ${CUDA_ENABLE_LINEINFO}" ) message(VERBOSE "CUDF: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") +message(VERBOSE + "CUDF: Build with remote IO (e.g. 
AWS S3) support through KvikIO: ${CUDF_KVIKIO_REMOTE_IO}" +) # Set a default build type if none was specified rapids_cmake_build_type("Release") @@ -890,6 +899,9 @@ target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL # Define spdlog level target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}") +# Enable remote IO through KvikIO +target_compile_definitions(cudf PRIVATE $<$<BOOL:${CUDF_KVIKIO_REMOTE_IO}>:CUDF_KVIKIO_REMOTE_IO>) + # Compile stringified JIT sources first add_dependencies(cudf jitify_preprocess_run) diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index c949f48505e..73f875b46c2 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -22,7 +22,7 @@ function(find_and_configure_kvikio VERSION) GIT_REPOSITORY https://github.com/rapidsai/kvikio.git GIT_TAG branch-${VERSION} GIT_SHALLOW TRUE SOURCE_SUBDIR cpp - OPTIONS "KvikIO_BUILD_EXAMPLES OFF" + OPTIONS "KvikIO_BUILD_EXAMPLES OFF" "KvikIO_REMOTE_SUPPORT ${CUDF_KVIKIO_REMOTE_IO}" ) include("${rapids-cmake-dir}/export/find_package_root.cmake") diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 9ea39e692b6..fbff0491219 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -391,6 +391,7 @@ class user_datasource_wrapper : public datasource { datasource* const source; ///< A non-owning pointer to the user-implemented datasource }; +#ifdef CUDF_KVIKIO_REMOTE_IO /** * @brief Remote file source backed by KvikIO, which handles S3 filepaths seamlessly. */ @@ -470,7 +471,16 @@ class remote_file_source : public datasource { private: kvikio::RemoteHandle _kvikio_file; }; - +#else +/** + * @brief When KvikIO remote IO is disabled, `is_supported_remote_url()` always returns false. 
+ */ +class remote_file_source : public file_source { + public: + explicit remote_file_source(char const* filepath) : file_source(filepath) {} + static constexpr bool is_supported_remote_url(std::string const& url) { return false; } +}; +#endif } // namespace std::unique_ptr datasource::create(std::string const& filepath, From 5613ae0b437b0d5bd51d99713603356a086e646d Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Sun, 10 Nov 2024 11:49:13 +0100 Subject: [PATCH 2/6] guard #include --- cpp/src/io/utilities/datasource.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index fbff0491219..6fa0cf84e40 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -26,7 +26,6 @@ #include #include -#include #include @@ -34,9 +33,14 @@ #include #include -#include #include +#ifdef CUDF_KVIKIO_REMOTE_IO +#include + +#include +#endif + namespace cudf { namespace io { namespace { From c63a20a710f9ce7034baf44b4d95e635fbbdb920 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Sun, 10 Nov 2024 11:50:24 +0100 Subject: [PATCH 3/6] static std::regex pattern --- build.sh | 1 + cpp/src/io/utilities/datasource.cpp | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build.sh b/build.sh index 56359eae235..155198e99d6 100755 --- a/build.sh +++ b/build.sh @@ -298,6 +298,7 @@ if buildAll || hasArg libcudf; then -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=${BUILD_PER_THREAD_DEFAULT_STREAM} \ -DCUDF_LARGE_STRINGS_DISABLED=${BUILD_DISABLE_LARGE_STRINGS} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCUDF_KVIKIO_REMOTE_IO=OFF \ ${EXTRA_CMAKE_ARGS} cd ${LIB_BUILD_DIR} diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 6fa0cf84e40..b5a32101aa5 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -34,11 +34,10 @@ #include #include +#include #ifdef CUDF_KVIKIO_REMOTE_IO #include - -#include #endif namespace cudf { @@ -468,7 +467,7 @@ class remote_file_source : public datasource { static bool is_supported_remote_url(std::string const& url) { // Regular expression to match "s3://" - std::regex pattern{R"(^s3://)", std::regex_constants::icase}; + static std::regex pattern{R"(^s3://)", std::regex_constants::icase}; return std::regex_search(url, pattern); } From 605457c1bfb075db937a4c9e1d76009db2524c40 Mon Sep 17 00:00:00 2001 From: "Mads R. B. 
Kristensen" Date: Sun, 10 Nov 2024 11:50:58 +0100 Subject: [PATCH 4/6] Update cpp/src/io/utilities/datasource.cpp Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- cpp/src/io/utilities/datasource.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index fbff0491219..1f871afa1ab 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -478,7 +478,7 @@ class remote_file_source : public datasource { class remote_file_source : public file_source { public: explicit remote_file_source(char const* filepath) : file_source(filepath) {} - static constexpr bool is_supported_remote_url(std::string const& url) { return false; } + static constexpr bool is_supported_remote_url(std::string const&) { return false; } }; #endif } // namespace From 90afe1d3a5a678fc64a540d8db865aab49bff9ba Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Sun, 10 Nov 2024 11:55:41 +0100 Subject: [PATCH 5/6] doc --- cpp/CMakeLists.txt | 2 +- cpp/src/io/utilities/datasource.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 59fcb6f68ed..65b05fd518b 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -92,7 +92,7 @@ option(CUDF_STATIC_LINTERS "Enable static linters during compilation" OFF) option( CUDF_KVIKIO_REMOTE_IO - "Enable remote IO (e.g. AWS S3) support through KvikIO. If disabled, cudf-python will still be able to do remote IO." + "Enable remote IO (e.g. AWS S3) support through KvikIO. If disabled, cudf-python will still be able to do remote IO through fsspec." 
ON ) diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 622be6188cc..5ccc91e4220 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -33,8 +33,8 @@ #include #include -#include #include +#include #ifdef CUDF_KVIKIO_REMOTE_IO #include From 6b11639d3bf1c33de9d2924e776a9f06adf0ba59 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Sun, 10 Nov 2024 13:11:02 +0100 Subject: [PATCH 6/6] remove debug: -DCUDF_KVIKIO_REMOTE_IO=OFF --- build.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/build.sh b/build.sh index 155198e99d6..56359eae235 100755 --- a/build.sh +++ b/build.sh @@ -298,7 +298,6 @@ if buildAll || hasArg libcudf; then -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=${BUILD_PER_THREAD_DEFAULT_STREAM} \ -DCUDF_LARGE_STRINGS_DISABLED=${BUILD_DISABLE_LARGE_STRINGS} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DCUDF_KVIKIO_REMOTE_IO=OFF \ ${EXTRA_CMAKE_ARGS} cd ${LIB_BUILD_DIR}