Skip to content

Commit

Permalink
Add compile option to enable large strings support (#16037)
Browse files Browse the repository at this point in the history
Adds `CUDF_LARGE_STRINGS_DISABLED` compile-time option to disable large strings support.
The default is to now enable large strings support with this PR.

This changes the default behavior of the `LIBCUDF_LARGE_STRINGS_ENABLED` environment variable -- when the variable is not set. If the environment variable is not set, then the default behavior depends on the compile option.
If `CUDF_LARGE_STRINGS_DISABLED` is compiled `ON` then setting `LIBCUDF_LARGE_STRINGS_ENABLED=1` will turn it **on** at runtime.
If `CUDF_LARGE_STRINGS_DISABLED` is not compiled on then setting `LIBCUDF_LARGE_STRINGS_ENABLED=0` will turn it **off** at runtime.

This PR also sets `CUDF_LARGE_STRINGS_DISABLED=OFF` by default in the `build.sh`

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Robert Maynard (https://github.com/robertmaynard)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Jason Lowe (https://github.com/jlowe)
  - Bradley Dice (https://github.com/bdice)

URL: #16037
  • Loading branch information
davidwendt authored Jul 2, 2024
1 parent 31ed9fd commit 3bd9975
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 12 deletions.
9 changes: 8 additions & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ ARGS=$*
# script, and that this script resides in the repo dir!
REPODIR=$(cd $(dirname $0); pwd)

VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats"
VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings"
HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"<args>\\\"]
clean - remove all existing build artifacts and configuration (start
over)
Expand All @@ -39,6 +39,7 @@ HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [li
--opensource_nvcomp - disable use of proprietary nvcomp extensions
--show_depr_warn - show cmake deprecation warnings
--ptds - enable per-thread default stream
--disable_large_strings - disable large strings support
--build_metrics - generate build metrics report for libcudf
--incl_cache_stats - include cache statistics in build metrics report
--cmake-args=\\\"<args>\\\" - pass arbitrary list of CMake configuration options (escape all quotes in argument)
Expand Down Expand Up @@ -69,6 +70,7 @@ BUILD_DISABLE_DEPRECATION_WARNINGS=ON
BUILD_PER_THREAD_DEFAULT_STREAM=OFF
BUILD_REPORT_METRICS=OFF
BUILD_REPORT_INCL_CACHE_STATS=OFF
BUILD_DISABLE_LARGE_STRINGS=OFF
USE_PROPRIETARY_NVCOMP=ON
PYTHON_ARGS_FOR_INSTALL="-m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true"

Expand Down Expand Up @@ -153,6 +155,7 @@ function buildLibCudfJniInDocker {
-DCUDF_ENABLE_ARROW_S3=OFF \
-DBUILD_TESTS=OFF \
-DCUDF_USE_PER_THREAD_DEFAULT_STREAM=ON \
-DCUDF_LARGE_STRINGS_DISABLED=ON \
-DRMM_LOGGING_LEVEL=OFF \
-DBUILD_SHARED_LIBS=OFF && \
cmake --build . --parallel ${PARALLEL_LEVEL} && \
Expand Down Expand Up @@ -239,6 +242,9 @@ if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUDF_CPP"* ]]; then
EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_CUDF_CPP=ON"
fi

if hasArg --disable_large_strings; then
BUILD_DISABLE_LARGE_STRINGS="ON"
fi

# If clean given, run it prior to any other steps
if hasArg clean; then
Expand Down Expand Up @@ -292,6 +298,7 @@ if buildAll || hasArg libcudf; then
-DBUILD_BENCHMARKS=${BUILD_BENCHMARKS} \
-DDISABLE_DEPRECATION_WARNINGS=${BUILD_DISABLE_DEPRECATION_WARNINGS} \
-DCUDF_USE_PER_THREAD_DEFAULT_STREAM=${BUILD_PER_THREAD_DEFAULT_STREAM} \
-DCUDF_LARGE_STRINGS_DISABLED=${BUILD_DISABLE_LARGE_STRINGS} \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
${EXTRA_CMAKE_ARGS}

Expand Down
3 changes: 3 additions & 0 deletions ci/test_java.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ EXITCODE=0
trap "EXITCODE=1" ERR
set +e

# disable large strings
export LIBCUDF_LARGE_STRINGS_ENABLED=0

rapids-logger "Run Java tests"
pushd java
mvn test -B -DCUDF_JNI_ENABLE_PROFILING=OFF
Expand Down
7 changes: 7 additions & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON)
option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON)
mark_as_advanced(CUDF_BUILD_TESTUTIL)
option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON)
option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF)
mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED)
option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF)
option(CUDF_ENABLE_ARROW_ORC "Build the Arrow ORC adapter" OFF)
option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF)
Expand Down Expand Up @@ -783,6 +785,11 @@ if(NOT USE_NVTX)
target_compile_definitions(cudf PUBLIC NVTX_DISABLE)
endif()

# Disable large strings support
if(CUDF_LARGE_STRINGS_DISABLED)
target_compile_definitions(cudf PRIVATE CUDF_LARGE_STRINGS_DISABLED)
endif()

# Define RMM logging level
target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL")

Expand Down
5 changes: 5 additions & 0 deletions cpp/src/strings/utilities.cu
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,13 @@ int64_t get_offset64_threshold()

bool is_large_strings_enabled()
{
// default depends on compile-time switch but can be overridden by the environment variable
auto const env = std::getenv("LIBCUDF_LARGE_STRINGS_ENABLED");
#ifdef CUDF_LARGE_STRINGS_DISABLED
return env != nullptr && std::string(env) == "1";
#else
return env == nullptr || std::string(env) == "1";
#endif
}

int64_t get_offset_value(cudf::column_view const& offsets,
Expand Down
11 changes: 0 additions & 11 deletions python/cudf/cudf/tests/test_column.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,17 +515,6 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
np.testing.assert_array_equal(expect_mask, got_mask)


def test_concatenate_large_column_strings():
num_strings = 1_000_000
string_scale_f = 100

s_1 = cudf.Series(["very long string " * string_scale_f] * num_strings)
s_2 = cudf.Series(["very long string " * string_scale_f] * num_strings)

with pytest.raises(OverflowError):
cudf.concat([s_1, s_2])


@pytest.mark.parametrize(
"alias,expect_dtype",
[
Expand Down

0 comments on commit 3bd9975

Please sign in to comment.