Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add environment variables for I/O thread pool and slice sizes #10218

Merged
merged 3 commits into from
Feb 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cpp/src/io/utilities/config_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ enum class usage_policy : uint8_t { OFF, GDS, ALWAYS };
*/
usage_policy get_env_policy()
{
static auto const env_val = getenv_or("LIBCUDF_CUFILE_POLICY", "GDS");
static auto const env_val = getenv_or<std::string>("LIBCUDF_CUFILE_POLICY", "GDS");
if (env_val == "OFF") return usage_policy::OFF;
if (env_val == "GDS") return usage_policy::GDS;
if (env_val == "ALWAYS") return usage_policy::ALWAYS;
Expand All @@ -69,7 +69,7 @@ enum class usage_policy : uint8_t { OFF, STABLE, ALWAYS };
*/
usage_policy get_env_policy()
{
static auto const env_val = getenv_or("LIBCUDF_NVCOMP_POLICY", "STABLE");
static auto const env_val = getenv_or<std::string>("LIBCUDF_NVCOMP_POLICY", "STABLE");
if (env_val == "OFF") return usage_policy::OFF;
if (env_val == "STABLE") return usage_policy::STABLE;
if (env_val == "ALWAYS") return usage_policy::ALWAYS;
Expand Down
13 changes: 12 additions & 1 deletion cpp/src/io/utilities/config_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/
#pragma once

#include <sstream>
#include <string>

namespace cudf::io::detail {
Expand All @@ -23,7 +24,17 @@ namespace cudf::io::detail {
* @brief Returns the value of the environment variable, or a default value if the variable is not
* present.
*/
std::string getenv_or(std::string const& env_var_name, std::string_view default_val);
ttnghia marked this conversation as resolved.
Show resolved Hide resolved
template <typename T>
T getenv_or(std::string_view env_var_name, T default_val)
{
auto const env_val = std::getenv(env_var_name.data());
if (env_val == nullptr) { return default_val; }

std::stringstream sstream(env_val);
T converted_val;
sstream >> converted_val;
return converted_val;
}

namespace cufile_integration {

Expand Down
15 changes: 9 additions & 6 deletions cpp/src/io/utilities/file_io_utilities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ void cufile_shim::modify_cufile_json() const
temp_directory tmp_config_dir{"cudf_cufile_config"};

// Modify the config file based on the policy
auto const config_file_path = getenv_or(json_path_env_var, "/etc/cufile.json");
auto const config_file_path = getenv_or<std::string>(json_path_env_var, "/etc/cufile.json");
std::ifstream user_config_file(config_file_path);
// Modified config file is stored in a temporary directory
auto const cudf_config_path = tmp_config_dir.path() + "/cufile.json";
Expand Down Expand Up @@ -170,7 +170,8 @@ cufile_registered_file::~cufile_registered_file() { shim->handle_deregister(cf_h
cufile_input_impl::cufile_input_impl(std::string const& filepath)
: shim{cufile_shim::instance()},
cf_file(shim, filepath, O_RDONLY | O_DIRECT),
pool(16) // The benefit from multithreaded read plateaus around 16 threads
// The benefit from multithreaded read plateaus around 16 threads
pool(getenv_or("LIBCUDF_CUFILE_THREAD_COUNT", 16))
{
pool.sleep_duration = 10;
}
Expand All @@ -194,9 +195,11 @@ std::vector<std::future<ResultT>> make_sliced_tasks(
F function, DataT* ptr, size_t offset, size_t size, cudf::detail::thread_pool& pool)
{
std::vector<std::future<ResultT>> slice_tasks;
constexpr size_t max_slice_bytes = 4 * 1024 * 1024;
size_t const n_slices = util::div_rounding_up_safe(size, max_slice_bytes);
size_t slice_offset = 0;
constexpr size_t default_max_slice_bytes = 4 * 1024 * 1024;
static auto const max_slice_bytes =
getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_bytes);
size_t const n_slices = util::div_rounding_up_safe(size, max_slice_bytes);
size_t slice_offset = 0;
for (size_t t = 0; t < n_slices; ++t) {
DataT* ptr_slice = ptr + slice_offset;

Expand Down Expand Up @@ -250,7 +253,7 @@ size_t cufile_input_impl::read(size_t offset,
cufile_output_impl::cufile_output_impl(std::string const& filepath)
: shim{cufile_shim::instance()},
cf_file(shim, filepath, O_CREAT | O_RDWR | O_DIRECT, 0664),
pool(16)
pool(getenv_or("LIBCUDF_CUFILE_THREAD_COUNT", 16))
{
}

Expand Down
6 changes: 6 additions & 0 deletions docs/cudf/source/basics/io-gds-integration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,9 @@ Operations that support the use of GPUDirect Storage:
- `to_csv`
- `to_parquet`
- `to_orc`

Several parameters that can be used to tune the performance of GDS-enabled I/O are exposed through environment variables:

- ``LIBCUDF_CUFILE_THREAD_COUNT``: Integral value, maximum number of parallel reads/writes per file (default 16);
- ``LIBCUDF_CUFILE_SLICE_SIZE``: Integral value, maximum size of each GDS read/write, in bytes (default 4MB).
Larger I/O operations are split into multiple calls.