From 3bd9975e867c9d2a077ed50fa339cecfd9bc8d9b Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 2 Jul 2024 15:20:03 -0400 Subject: [PATCH] Add compile option to enable large strings support (#16037) Adds `CUDF_LARGE_STRINGS_DISABLED` compile-time option to disable large strings support. The default is to now enable large strings support with this PR. This changes the default behavior of the `LIBCUDF_LARGE_STRINGS_ENABLED` environment variable -- when the variable is not set. If the environment variable is not set, then the default behavior depends on the compile option. If `CUDF_LARGE_STRINGS_DISABLED` is compiled `ON` then setting `LIBCUDF_LARGE_STRINGS_ENABLED=1` will turn it **on** at runtime. If `CUDF_LARGE_STRINGS_DISABLED` is not compiled on then setting `LIBCUDF_LARGE_STRINGS_ENABLED=0` will turn it **off** at runtime. This PR also sets `CUDF_LARGE_STRINGS_DISABLED=OFF` by default in the `build.sh` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Robert Maynard (https://github.com/robertmaynard) - Vyas Ramasubramani (https://github.com/vyasr) - Jason Lowe (https://github.com/jlowe) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/16037 --- build.sh | 9 ++++++++- ci/test_java.sh | 3 +++ cpp/CMakeLists.txt | 7 +++++++ cpp/src/strings/utilities.cu | 5 +++++ python/cudf/cudf/tests/test_column.py | 11 ----------- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/build.sh b/build.sh index 4291c88ea12..52bb1e64d16 100755 --- a/build.sh +++ b/build.sh @@ -17,7 +17,7 @@ ARGS=$* # script, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats" +VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings" HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"\\\"] clean - remove all existing build artifacts and configuration (start over) @@ -39,6 +39,7 @@ HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [li --opensource_nvcomp - disable use of proprietary nvcomp extensions --show_depr_warn - show cmake deprecation warnings --ptds - enable per-thread default stream + --disable_large_strings - disable large strings support --build_metrics - generate build metrics report for libcudf --incl_cache_stats - include cache statistics in build metrics report --cmake-args=\\\"\\\" - pass arbitrary list of CMake configuration options (escape all quotes in argument) @@ -69,6 +70,7 @@ BUILD_DISABLE_DEPRECATION_WARNINGS=ON BUILD_PER_THREAD_DEFAULT_STREAM=OFF BUILD_REPORT_METRICS=OFF BUILD_REPORT_INCL_CACHE_STATS=OFF +BUILD_DISABLE_LARGE_STRINGS=OFF USE_PROPRIETARY_NVCOMP=ON PYTHON_ARGS_FOR_INSTALL="-m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true" @@ -153,6 +155,7 @@ function buildLibCudfJniInDocker { -DCUDF_ENABLE_ARROW_S3=OFF \ -DBUILD_TESTS=OFF \ -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=ON \ + -DCUDF_LARGE_STRINGS_DISABLED=ON \ -DRMM_LOGGING_LEVEL=OFF \ -DBUILD_SHARED_LIBS=OFF && \ cmake --build . --parallel ${PARALLEL_LEVEL} && \ @@ -239,6 +242,9 @@ if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUDF_CPP"* ]]; then EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_CUDF_CPP=ON" fi +if hasArg --disable_large_strings; then + BUILD_DISABLE_LARGE_STRINGS="ON" +fi # If clean given, run it prior to any other steps if hasArg clean; then @@ -292,6 +298,7 @@ if buildAll || hasArg libcudf; then -DBUILD_BENCHMARKS=${BUILD_BENCHMARKS} \ -DDISABLE_DEPRECATION_WARNINGS=${BUILD_DISABLE_DEPRECATION_WARNINGS} \ -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=${BUILD_PER_THREAD_DEFAULT_STREAM} \ + -DCUDF_LARGE_STRINGS_DISABLED=${BUILD_DISABLE_LARGE_STRINGS} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ ${EXTRA_CMAKE_ARGS} diff --git a/ci/test_java.sh b/ci/test_java.sh index 9713eb192d2..629ad11014a 100755 --- a/ci/test_java.sh +++ b/ci/test_java.sh @@ -39,6 +39,9 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e +# disable large strings +export LIBCUDF_LARGE_STRINGS_ENABLED=0 + rapids-logger "Run Java tests" pushd java mvn test -B -DCUDF_JNI_ENABLE_PROFILING=OFF diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 54070ab6f5a..2811711d58c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -52,6 +52,8 @@ option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON) mark_as_advanced(CUDF_BUILD_TESTUTIL) option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON) +option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF) +mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED) option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF) option(CUDF_ENABLE_ARROW_ORC "Build the Arrow ORC adapter" OFF) option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF) @@ -783,6 +785,11 @@ if(NOT USE_NVTX) target_compile_definitions(cudf PUBLIC NVTX_DISABLE) endif() +# Disable large strings support +if(CUDF_LARGE_STRINGS_DISABLED) + target_compile_definitions(cudf PRIVATE CUDF_LARGE_STRINGS_DISABLED) +endif() + # Define RMM logging level target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL") diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 101004a5d06..f70598f33be 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -158,8 +158,13 @@ int64_t get_offset64_threshold() bool is_large_strings_enabled() { + // default depends on compile-time switch but can be overridden by the environment variable auto const env = std::getenv("LIBCUDF_LARGE_STRINGS_ENABLED"); +#ifdef CUDF_LARGE_STRINGS_DISABLED return env != nullptr && std::string(env) == "1"; +#else + return env == nullptr || std::string(env) == "1"; +#endif } int64_t get_offset_value(cudf::column_view const& offsets, diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index ea919c786b9..c288155112c 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -515,17 +515,6 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype): np.testing.assert_array_equal(expect_mask, got_mask) -def test_concatenate_large_column_strings(): - num_strings = 1_000_000 - string_scale_f = 100 - - s_1 = cudf.Series(["very long string " * string_scale_f] * num_strings) - s_2 = cudf.Series(["very long string " * string_scale_f] * num_strings) - - with pytest.raises(OverflowError): - cudf.concat([s_1, s_2]) - - @pytest.mark.parametrize( "alias,expect_dtype", [