diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 38a57caa5f7..6eb621abcc3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -73,7 +73,7 @@ Compilers: * `gcc` version 9.3+ * `nvcc` version 11.5+ -* `cmake` version 3.20.1+ +* `cmake` version 3.23.1+ CUDA/GPU: diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index e9ea4630133..34fcbb6e104 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -39,6 +39,9 @@ export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" # Dask & Distributed option to install main(nightly) or `conda-forge` packages. export INSTALL_DASK_MAIN=1 +# Dask version to install when `INSTALL_DASK_MAIN=0` +export DASK_STABLE_VERSION="2022.7.1" + function remove_libcudf_kernel_cache_dir { EXITCODE=$? logger "removing kernel cache dir: $LIBCUDF_KERNEL_CACHE_PATH" @@ -82,8 +85,8 @@ if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then gpuci_logger "gpuci_mamba_retry update dask" gpuci_mamba_retry update dask else - gpuci_logger "gpuci_mamba_retry install conda-forge::dask>=2022.7.1 conda-forge::distributed>=2022.7.1 conda-forge::dask-core>=2022.7.1 --force-reinstall" - gpuci_mamba_retry install conda-forge::dask>=2022.7.1 conda-forge::distributed>=2022.7.1 conda-forge::dask-core>=2022.7.1 --force-reinstall + gpuci_logger "gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" + gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall fi # Install the master version of streamz diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index a931546292e..87d378f4d65 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -83,10 +83,17 @@ if [ "$BUILD_LIBCUDF" == '1' ]; then # BUILD_LIBCUDF == 1 means this job is being run on the cpu_build jobs # that is where we must also build the strings_udf package + mkdir -p ${CONDA_BLD_DIR}/strings_udf/work + STRINGS_UDF_BUILD_DIR=${CONDA_BLD_DIR}/strings_udf/work + gpuci_logger "Build conda pkg for cudf (python 3.8), for strings_udf" + gpuci_conda_retry mambabuild --no-build-id --croot ${STRINGS_UDF_BUILD_DIR} -c ${CONDA_BLD_DIR} conda/recipes/cudf ${CONDA_BUILD_ARGS} --python=3.8 + gpuci_logger "Build conda pkg for cudf (python 3.9), for strings_udf" + gpuci_conda_retry mambabuild --no-build-id --croot ${STRINGS_UDF_BUILD_DIR} -c ${CONDA_BLD_DIR} conda/recipes/cudf ${CONDA_BUILD_ARGS} --python=3.9 + gpuci_logger "Build conda pkg for strings_udf (python 3.8)" - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/strings_udf $CONDA_BUILD_ARGS --python=3.8 + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} -c ${STRINGS_UDF_BUILD_DIR} -c ${CONDA_BLD_DIR} conda/recipes/strings_udf $CONDA_BUILD_ARGS --python=3.8 gpuci_logger "Build conda pkg for strings_udf (python 3.9)" - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/strings_udf $CONDA_BUILD_ARGS --python=3.9 + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} -c ${STRINGS_UDF_BUILD_DIR} -c ${CONDA_BLD_DIR} conda/recipes/strings_udf $CONDA_BUILD_ARGS --python=3.9 mkdir -p ${CONDA_BLD_DIR}/libcudf/work cp -r ${CONDA_BLD_DIR}/work/* ${CONDA_BLD_DIR}/libcudf/work diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index f3c302173c8..117ca6f4e42 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -34,6 +34,9 @@ unset GIT_DESCRIBE_TAG # Dask & Distributed option to install main(nightly) or `conda-forge` packages. export INSTALL_DASK_MAIN=1 +# Dask version to install when `INSTALL_DASK_MAIN=0` +export DASK_STABLE_VERSION="2022.7.1" + # ucx-py version export UCX_PY_VERSION='0.29.*' @@ -91,8 +94,8 @@ function install_dask { gpuci_mamba_retry update dask conda list else - gpuci_logger "gpuci_mamba_retry install conda-forge::dask>=2022.7.1 conda-forge::distributed>=2022.7.1 conda-forge::dask-core>=2022.7.1 --force-reinstall" - gpuci_mamba_retry install conda-forge::dask>=2022.7.1 conda-forge::distributed>=2022.7.1 conda-forge::dask-core>=2022.7.1 --force-reinstall + gpuci_logger "gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" + gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall fi # Install the main version of streamz gpuci_logger "Install the main version of streamz" @@ -188,6 +191,9 @@ else # copied by CI from the upstream 11.5 jobs into $CONDA_ARTIFACT_PATH gpuci_logger "Installing cudf, dask-cudf, cudf_kafka, and custreamz" gpuci_mamba_retry install cudf dask-cudf cudf_kafka custreamz -c "${CONDA_BLD_DIR}" -c "${CONDA_ARTIFACT_PATH}" + + gpuci_logger "Check current conda environment" + conda list --show-channel-urls gpuci_logger "GoogleTests" # Run libcudf and libcudf_kafka gtests from libcudf-tests package diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index c3e41927a05..ce63b68424f 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -14,7 +14,7 @@ dependencies: - clang-tools=11.1.0 - cupy>=9.5.0,<12.0.0a0 - rmm=22.12.* - - cmake>=3.20.1,!=3.23.0 + - cmake>=3.23.1 - cmake_setuptools>=0.1.3 - scikit-build>=0.13.1 - python>=3.8,<3.10 diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml index d9c3f21448f..0027a80f1ec 100644 --- a/conda/recipes/cudf/conda_build_config.yaml +++ b/conda/recipes/cudf/conda_build_config.yaml @@ -8,7 +8,7 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.20.1,!=3.23.0" + - ">=3.23.1" cuda_compiler: - nvcc diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index df9cfa7c3c5..a65373efec3 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -22,7 +22,7 @@ build: requirements: build: - - cmake >=3.20.1,!=3.23.0 + - cmake >=3.23.1 - {{ compiler('c') }} - {{ compiler('cxx') }} - sysroot_{{ target_platform }} {{ sysroot_version }} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 4cf672997d3..7f5bf219f1f 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -11,7 +11,7 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.20.1,!=3.23.0" + - ">=3.23.1" gtest_version: - "=1.10.0" diff --git a/conda/recipes/strings_udf/conda_build_config.yaml b/conda/recipes/strings_udf/conda_build_config.yaml index d9c3f21448f..0027a80f1ec 100644 --- a/conda/recipes/strings_udf/conda_build_config.yaml +++ b/conda/recipes/strings_udf/conda_build_config.yaml @@ -8,7 +8,7 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.20.1,!=3.23.0" + - ">=3.23.1" cuda_compiler: - nvcc diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 97a4205bed1..60e914f07d3 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(../fetch_rapids.cmake) include(rapids-cmake) diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 71341277109..da3b6b8af62 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -11,7 +11,7 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. # ============================================================================= -cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(../../fetch_rapids.cmake) include(rapids-cmake) diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index e60e9f95440..9545b5289f9 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -409,7 +409,8 @@ __global__ void compute_row_sizes(device_span cols, auto const num_rows = output.size(); if (tid >= num_rows) { return; } - // branch stack. points to the last list prior to branching. + // my_branch_stack points to the last span prior to branching. a branch occurs only + // when we are inside of a list contained within a struct column. row_span* my_branch_stack = thread_branch_stacks + (threadIdx.x * max_branch_depth); size_type branch_depth{0}; @@ -424,11 +425,12 @@ __global__ void compute_row_sizes(device_span cols, for (size_type idx = 0; idx < cols.size(); idx++) { column_device_view const& col = cols[idx]; - // if we've returned from a branch + // if we've returned from a branch, pop to the proper span if (info[idx].branch_depth_start < last_branch_depth) { - cur_span = my_branch_stack[--branch_depth]; + branch_depth = info[idx].branch_depth_start; + cur_span = my_branch_stack[branch_depth]; } - // if we're entering a new branch. + // if we're entering a new branch, push the current span // NOTE: this case can happen (a pop and a push by the same column) // when we have a struct if (info[idx].branch_depth_end > info[idx].branch_depth_start) { diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 61c2fa12895..8151e0d6d8d 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -618,9 +618,47 @@ TEST_F(RowBitCount, Table) thrust::make_counting_iterator(0) + t.num_rows(), mcv.begin(), sum_functor{cv0.data(), cv1.data(), cv2.data()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result); } +TEST_F(RowBitCount, DepthJump) +{ + // jump more than 1 branch depth. + + using T = int; + + // struct>, int> + // the jump occurs from depth 2 (the leafmost int column) + // to depth 0 (the topmost int column) + cudf::test::fixed_width_column_wrapper ____c0{1, 2, 3, 5, 5, 6, 7, 8}; + cudf::test::fixed_width_column_wrapper ___offsets{0, 2, 4, 6, 8}; + auto ___c0 = cudf::make_lists_column(4, ___offsets.release(), ____c0.release(), 0, {}); + std::vector> __children; + __children.push_back(std::move(___c0)); + cudf::test::structs_column_wrapper __c0(std::move(__children)); + cudf::test::fixed_width_column_wrapper _offsets{0, 3, 4}; + auto _c0 = cudf::make_lists_column(2, _offsets.release(), __c0.release(), 0, {}); + cudf::test::fixed_width_column_wrapper _c1{3, 4}; + std::vector> children; + children.push_back(std::move(_c0)); + children.push_back(_c1.release()); + cudf::test::structs_column_wrapper c0(std::move(children)); + + table_view t({c0}); + auto result = cudf::row_bit_count(t); + + // expected size = (num rows at level 1 + num_rows at level 2) + (# values the leaf int column) + + // 1 (value in topmost int column) + constexpr size_type offset_size = sizeof(cudf::offset_type) * CHAR_BIT; + constexpr size_type type_size = sizeof(T) * CHAR_BIT; + cudf::test::fixed_width_column_wrapper expected{ + ((1 + 3) * offset_size) + (6 * type_size) + (1 * type_size), + ((1 + 1) * offset_size) + (2 * type_size) + (1 * type_size)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + TEST_F(RowBitCount, SlicedColumnsFixedWidth) { auto const slice_size = 7; diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index ef7500a2be9..32a51224668 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -27,7 +27,7 @@ "source": [ "Like many tabular data processing APIs, cuDF provides a range of composable, DataFrame style operators. While out of the box functions are flexible and useful, it is sometimes necessary to write custom code, or user-defined functions (UDFs), that can be applied to rows, columns, and other groupings of the cells making up the DataFrame.\n", "\n", - "In conjunction with the broader GPU PyData ecosystem, cuDF provides interfaces to run UDFs on a variety of data structures. Currently, we can only execute UDFs on numeric, boolean, datetime, and timedelta typed data (support for strings is being planned). This guide covers writing and executing UDFs on the following data structures:\n", + "In conjunction with the broader GPU PyData ecosystem, cuDF provides interfaces to run UDFs on a variety of data structures. Currently, we can only execute UDFs on numeric, boolean, datetime, and timedelta typed data with partial support for strings in some APIs. This guide covers writing and executing UDFs on the following data structures:\n", "\n", "- Series\n", "- DataFrame\n", @@ -328,6 +328,91 @@ "In addition, `cudf.NA` can be returned from a function directly or conditionally. This capability should allow you to implement custom null handling in a wide variety of cases." ] }, + { + "cell_type": "markdown", + "id": "cc7c7e67", + "metadata": {}, + "source": [ + "### String data" + ] + }, + { + "cell_type": "markdown", + "id": "c0980218", + "metadata": {}, + "source": [ + "Support for a subset of string functionality is available for `apply` through the [strings_udf](https://anaconda.org/rapidsai-nightly/strings_udf) package, which is separately installable. Currently, the following string operations are provided through `strings_udf`:\",\n", + "- `str.count`\n", + "- `str.startswith`\n", + "- `str.endswith`\n", + "- `str.find`\n", + "- `str.rfind`\n", + "- `str.isalnum`\n", + "- `str.isdecimal`\n", + "- `str.isdigit`\n", + "- `str.islower`\n", + "- `str.isupper`\n", + "- `str.isalpha`\n", + "- `str.istitle`\n", + "- `str.isspace`\n", + "- `==`, `!=`, `>=`, `<=`, `>`, `<` (between two strings)\n", + "- `len` (e.g. `len(some_string))`\n", + "- `__contains__` (e.g, `'abc' in some_string`)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d7d1abd7", + "metadata": {}, + "outputs": [], + "source": [ + "sr = cudf.Series(['', 'abc', 'some_example'])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e8538ba0", + "metadata": {}, + "outputs": [], + "source": [ + "def f(st):\n", + " if len(st) > 0:\n", + " if st.startswith('a'):\n", + " return 1\n", + " elif 'example' in st:\n", + " return 2\n", + " else:\n", + " return -1\n", + " else:\n", + " return 42" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "23524fd8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 42\n", + "1 1\n", + "2 2\n", + "dtype: int64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sr.apply(f)" + ] + }, { "cell_type": "markdown", "id": "54cafbc0", @@ -349,7 +434,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "id": "732434f6", "metadata": {}, "outputs": [], @@ -359,7 +444,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "id": "4f5997e5", "metadata": {}, "outputs": [], @@ -385,7 +470,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "id": "ea6008a6", "metadata": {}, "outputs": [], @@ -405,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "id": "183a82ed", "metadata": {}, "outputs": [ @@ -485,7 +570,7 @@ "4 979 982 1011 9790.0" ] }, - "execution_count": 15, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -534,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "id": "73653918", "metadata": {}, "outputs": [], @@ -553,7 +638,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "id": "077feb75", "metadata": {}, "outputs": [ @@ -609,7 +694,7 @@ "2 3 6" ] }, - "execution_count": 17, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -632,7 +717,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "id": "091e39e1", "metadata": {}, "outputs": [ @@ -645,7 +730,7 @@ "dtype: int64" ] }, - "execution_count": 18, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -664,7 +749,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "id": "bd345fab", "metadata": {}, "outputs": [ @@ -677,7 +762,7 @@ "dtype: object" ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -704,7 +789,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "id": "b70f4b3b", "metadata": {}, "outputs": [ @@ -756,7 +841,7 @@ "2 3" ] }, - "execution_count": 20, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -775,7 +860,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "id": "0313c8df", "metadata": {}, "outputs": [ @@ -788,7 +873,7 @@ "dtype: int64" ] }, - "execution_count": 21, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -807,7 +892,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "id": "96a7952a", "metadata": {}, "outputs": [ @@ -863,7 +948,7 @@ "2 3 1" ] }, - "execution_count": 22, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -886,7 +971,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "id": "e0815f60", "metadata": {}, "outputs": [ @@ -899,7 +984,7 @@ "dtype: int64" ] }, - "execution_count": 23, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -918,7 +1003,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 27, "id": "495efd14", "metadata": {}, "outputs": [ @@ -974,7 +1059,7 @@ "2 3 3.14" ] }, - "execution_count": 24, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -992,7 +1077,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "id": "678b0b5a", "metadata": {}, "outputs": [ @@ -1005,7 +1090,7 @@ "dtype: float64" ] }, - "execution_count": 25, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1037,7 +1122,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 29, "id": "acf48d56", "metadata": {}, "outputs": [ @@ -1089,7 +1174,7 @@ "2 5" ] }, - "execution_count": 26, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1110,7 +1195,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 30, "id": "78a98172", "metadata": {}, "outputs": [ @@ -1123,7 +1208,7 @@ "dtype: float64" ] }, - "execution_count": 27, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1142,7 +1227,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 31, "id": "142c30a9", "metadata": {}, "outputs": [ @@ -1210,7 +1295,7 @@ "2 3 6 4 8 6" ] }, - "execution_count": 28, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1231,7 +1316,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 32, "id": "fee9198a", "metadata": {}, "outputs": [ @@ -1244,7 +1329,7 @@ "dtype: float64" ] }, - "execution_count": 29, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1253,6 +1338,134 @@ "df.apply(f, axis=1)" ] }, + { + "cell_type": "markdown", + "id": "71e30d33", + "metadata": {}, + "source": [ + "### String Data" + ] + }, + { + "cell_type": "markdown", + "id": "1a3694ea", + "metadata": {}, + "source": [ + "String data may be used inside `DataFrame.apply` UDFs, subject to the same constraints as those for `Series.apply`. See the section on string handling for `Series` UDFs above for details. Below is a simple example extending the row UDF logic from above in the case of a string column:" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "cccd59f7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
str_colscale
0abc1
1ABC2
2Example3
\n", + "
" + ], + "text/plain": [ + " str_col scale\n", + "0 abc 1\n", + "1 ABC 2\n", + "2 Example 3" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str_df = cudf.DataFrame({\n", + " 'str_col': ['abc', 'ABC', 'Example'],\n", + " 'scale': [1, 2, 3]\n", + "})\n", + "str_df" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "35737fd9", + "metadata": {}, + "outputs": [], + "source": [ + "def f(row):\n", + " st = row['str_col']\n", + " scale = row['scale']\n", + " \n", + " if len(st) > 5:\n", + " return len(st) + scale\n", + " else:\n", + " return len(st)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "4ede4d5b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 3\n", + "1 3\n", + "2 10\n", + "dtype: int64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str_df.apply(f, axis=1)" + ] + }, { "cell_type": "markdown", "id": "9c587bd2", @@ -1273,7 +1486,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 36, "id": "90cbcd85", "metadata": {}, "outputs": [], @@ -1304,7 +1517,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 37, "id": "e782daff", "metadata": {}, "outputs": [ @@ -1376,7 +1589,7 @@ "2 3 6 4 8 6 9.0" ] }, - "execution_count": 31, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -1410,7 +1623,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 38, "id": "befd8333", "metadata": {}, "outputs": [ @@ -1484,7 +1697,7 @@ "4 979 982 1011" ] }, - "execution_count": 32, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -1511,7 +1724,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 39, "id": "d1f3dcaf", "metadata": {}, "outputs": [ @@ -1591,7 +1804,7 @@ "4 979 982 1011 1961.0" ] }, - "execution_count": 33, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -1626,7 +1839,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 40, "id": "6bc6aea3", "metadata": {}, "outputs": [ @@ -1642,7 +1855,7 @@ "dtype: float64" ] }, - "execution_count": 34, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -1654,7 +1867,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 41, "id": "a4c31df1", "metadata": {}, "outputs": [ @@ -1664,7 +1877,7 @@ "Rolling [window=3,min_periods=3,center=False]" ] }, - "execution_count": 35, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -1684,7 +1897,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 42, "id": "eb5a081b", "metadata": {}, "outputs": [], @@ -1710,7 +1923,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 43, "id": "ddec3263", "metadata": {}, "outputs": [ @@ -1726,7 +1939,7 @@ "dtype: float64" ] }, - "execution_count": 37, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -1745,7 +1958,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 44, "id": "8b61094a", "metadata": {}, "outputs": [ @@ -1813,7 +2026,7 @@ "4 59.0 59.0" ] }, - "execution_count": 38, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -1827,7 +2040,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 45, "id": "bb8c3019", "metadata": {}, "outputs": [ @@ -1925,7 +2138,7 @@ "9 100.0 100.0" ] }, - "execution_count": 39, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -1949,7 +2162,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 46, "id": "3dc272ab", "metadata": {}, "outputs": [ @@ -2029,7 +2242,7 @@ "4 -0.970850 False Sarah 0.342905" ] }, - "execution_count": 40, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -2041,7 +2254,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 47, "id": "c0578e0a", "metadata": {}, "outputs": [], @@ -2059,7 +2272,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 48, "id": "19f0f7fe", "metadata": {}, "outputs": [], @@ -2088,7 +2301,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 49, "id": "c43426c3", "metadata": {}, "outputs": [ @@ -2219,7 +2432,7 @@ "9 -0.725581 True George 0.405245 0.271319" ] }, - "execution_count": 43, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -2251,7 +2464,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 50, "id": "aa6a8509", "metadata": {}, "outputs": [ @@ -2261,7 +2474,7 @@ "array([ 1., 2., 3., 4., 10.])" ] }, - "execution_count": 44, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -2284,7 +2497,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 51, "id": "0bb8bf93", "metadata": {}, "outputs": [ @@ -2299,7 +2512,7 @@ "dtype: int32" ] }, - "execution_count": 45, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -2326,7 +2539,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 52, "id": "ce60b639", "metadata": {}, "outputs": [ @@ -2336,7 +2549,7 @@ "array([ 5., 10., 15., 20., 50.])" ] }, - "execution_count": 46, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -2360,7 +2573,7 @@ "id": "fe7eb68b", "metadata": {}, "source": [ - "- Only numeric nondecimal scalar types are currently supported as of yet, but strings and structured types are in planning. Attempting to use this API with those types will throw a `TypeError`.\n", + "- UDFs are currently only supported for numeric nondecimal scalar types (full support) and strings in `Series.apply` and `DataFrame.apply` (partial support, subject to the caveats outlined above). Attempting to use this API with unsupported types will raise a `TypeError`.\n", "- We do not yet fully support all arithmetic operators. Certain ops like bitwise operations are not currently implemented, but planned in future releases. If an operator is needed, a github issue should be raised so that it can be properly prioritized and implemented." ] }, @@ -2402,7 +2615,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.13" } }, "nbformat": 4, diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 26923927378..339f0f439a0 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -11,7 +11,7 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. # ============================================================================= -cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(../../../../fetch_rapids.cmake) include(rapids-cmake) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 6dc0f1800e0..1cea23669e9 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) set(cudf_version 22.12.00) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index cac37f1f274..f00c7d1f2b5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3955,6 +3955,12 @@ def apply( For more information, see the `cuDF guide to user defined functions `__. + Support for use of string data within UDFs is provided through the + `strings_udf `__ + RAPIDS library. Supported operations on strings include the subset of + functions and string methods that expect an input string but do not + return a string. Refer to caveats in the UDF guide referenced above. + Parameters ---------- func : function @@ -4078,6 +4084,46 @@ def apply( 1 4.8 2 5.0 dtype: float64 + + UDFs manipulating string data are allowed, as long as + they neither modify strings in place nor create new strings. + For example, the following UDF is allowed: + + >>> def f(row): + ... st = row['str_col'] + ... scale = row['scale'] + ... if len(st) == 0: + ... return -1 + ... elif st.startswith('a'): + ... return 1 - scale + ... elif 'example' in st: + ... return 1 + scale + ... else: + ... return 42 + ... + >>> df = cudf.DataFrame({ + ... 'str_col': ['', 'abc', 'some_example'], + ... 'scale': [1, 2, 3] + ... }) + >>> df.apply(f, axis=1) # doctest: +SKIP + 0 -1 + 1 -1 + 2 4 + dtype: int64 + + However, the following UDF is not allowed since it includes an + operation that requires the creation of a new string: a call to the + ``upper`` method. Methods that are not supported in this manner + will raise an ``AttributeError``. + + >>> def f(row): + ... st = row['str_col'].upper() + ... return 'ABC' in st + >>> df.apply(f, axis=1) # doctest: +SKIP + + For a complete list of supported functions and methods that may be + used to manipulate string data, see the the UDF guide, + """ if axis != 1: raise ValueError( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e94ca8d653d..f11052096e3 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2251,6 +2251,12 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): For more information, see the `cuDF guide to user defined functions `__. + Support for use of string data within UDFs is provided through the + `strings_udf `__ + RAPIDS library. Supported operations on strings include the subset of + functions and string methods that expect an input string but do not + return a string. Refer to caveats in the UDF guide referenced above. + Parameters ---------- func : function @@ -2332,6 +2338,43 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): 1 2 4.5 dtype: float64 + + UDFs manipulating string data are allowed, as long as + they neither modify strings in place nor create new strings. + For example, the following UDF is allowed: + + >>> def f(st): + ... if len(st) == 0: + ... return -1 + ... elif st.startswith('a'): + ... return 1 + ... elif 'example' in st: + ... return 2 + ... else: + ... return 3 + ... + >>> sr = cudf.Series(['', 'abc', 'some_example']) + >>> sr.apply(f) # doctest: +SKIP + 0 -1 + 1 1 + 2 2 + dtype: int64 + + However, the following UDF is not allowed since it includes an + operation that requires the creation of a new string: a call to the + ``upper`` method. Methods that are not supported in this manner + will raise an ``AttributeError``. + + >>> def f(st): + ... new = st.upper() + ... return 'ABC' in new + ... + >>> sr.apply(f) # doctest: +SKIP + + For a complete list of supported functions and methods that may be + used to manipulate string data, see the the UDF guide, + + """ if convert_dtype is not True: raise ValueError("Series.apply only supports convert_dtype=True") diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index ab00b37e4fa..52490444dba 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -7,6 +7,6 @@ requires = [ "setuptools", "cython>=0.29,<0.30", "scikit-build>=0.13.1", - "cmake>=3.20.1,!=3.23.0", + "cmake>=3.23.1", "ninja", ] diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 2641ef68379..0bf39df313a 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -29,6 +29,7 @@ from dask_cudf import sorting from dask_cudf.accessors import ListMethods, StructMethods +from dask_cudf.sorting import _get_shuffle_type DASK_VERSION = LooseVersion(dask.__version__) @@ -133,25 +134,16 @@ def do_apply_rows(df, func, incols, outcols, kwargs): ) @_dask_cudf_nvtx_annotate - def merge(self, other, **kwargs): - if kwargs.pop("shuffle", "tasks") != "tasks": - raise ValueError( - "Dask-cudf only supports task based shuffling, got %s" - % kwargs["shuffle"] - ) + def merge(self, other, shuffle=None, **kwargs): on = kwargs.pop("on", None) if isinstance(on, tuple): on = list(on) - return super().merge(other, on=on, shuffle="tasks", **kwargs) + return super().merge( + other, on=on, shuffle=_get_shuffle_type(shuffle), **kwargs + ) @_dask_cudf_nvtx_annotate - def join(self, other, **kwargs): - if kwargs.pop("shuffle", "tasks") != "tasks": - raise ValueError( - "Dask-cudf only supports task based shuffling, got %s" - % kwargs["shuffle"] - ) - + def join(self, other, shuffle=None, **kwargs): # CuDF doesn't support "right" join yet how = kwargs.pop("how", "left") if how == "right": @@ -160,15 +152,15 @@ def join(self, other, **kwargs): on = kwargs.pop("on", None) if isinstance(on, tuple): on = list(on) - return super().join(other, how=how, on=on, shuffle="tasks", **kwargs) + return super().join( + other, how=how, on=on, shuffle=_get_shuffle_type(shuffle), **kwargs + ) @_dask_cudf_nvtx_annotate - def set_index(self, other, sorted=False, divisions=None, **kwargs): - if kwargs.pop("shuffle", "tasks") != "tasks": - raise ValueError( - "Dask-cudf only supports task based shuffling, got %s" - % kwargs["shuffle"] - ) + def set_index( + self, other, sorted=False, divisions=None, shuffle=None, **kwargs + ): + pre_sorted = sorted del sorted @@ -196,13 +188,13 @@ def set_index(self, other, sorted=False, divisions=None, **kwargs): divisions = None # Use dask_cudf's sort_values - # TODO: Handle `sorted=True` df = self.sort_values( by, max_branch=kwargs.get("max_branch", None), divisions=divisions, set_divisions=True, ignore_index=True, + shuffle=shuffle, ) # Ignore divisions if its a dataframe @@ -229,7 +221,7 @@ def set_index(self, other, sorted=False, divisions=None, **kwargs): return super().set_index( other, sorted=pre_sorted, - shuffle="tasks", + shuffle=_get_shuffle_type(shuffle), divisions=divisions, **kwargs, ) @@ -246,6 +238,7 @@ def sort_values( na_position="last", sort_function=None, sort_function_kwargs=None, + shuffle=None, **kwargs, ): if kwargs: @@ -262,6 +255,7 @@ def sort_values( ignore_index=ignore_index, ascending=ascending, na_position=na_position, + shuffle=shuffle, sort_function=sort_function, sort_function_kwargs=sort_function_kwargs, ) @@ -338,12 +332,11 @@ def repartition(self, *args, **kwargs): return super().repartition(*args, **kwargs) @_dask_cudf_nvtx_annotate - def shuffle(self, *args, **kwargs): + def shuffle(self, *args, shuffle=None, **kwargs): """Wraps dask.dataframe DataFrame.shuffle method""" - shuffle_arg = kwargs.pop("shuffle", None) - if shuffle_arg and shuffle_arg != "tasks": - raise ValueError("dask_cudf does not support disk-based shuffle.") - return super().shuffle(*args, shuffle="tasks", **kwargs) + return super().shuffle( + *args, shuffle=_get_shuffle_type(shuffle), **kwargs + ) @_dask_cudf_nvtx_annotate def groupby(self, by=None, **kwargs): diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index 1c89baba592..a359920229d 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -6,10 +6,10 @@ import numpy as np import tlz as toolz +import dask from dask.base import tokenize from dask.dataframe import methods from dask.dataframe.core import DataFrame, Index, Series -from dask.dataframe.shuffle import rearrange_by_column from dask.highlevelgraph import HighLevelGraph from dask.utils import M @@ -231,10 +231,18 @@ def sort_values( ignore_index=False, ascending=True, na_position="last", + shuffle=None, sort_function=None, sort_function_kwargs=None, ): """Sort by the given list/tuple of column names.""" + + shuffle = _get_shuffle_type(shuffle) + # Note that we cannot import `rearrange_by_column` in + # the header, because we need to allow dask-cuda to + # patch this function before we import it here + from dask.dataframe.shuffle import rearrange_by_column + if not isinstance(ascending, bool): raise ValueError("ascending must be either True or False") if na_position not in ("first", "last"): @@ -285,7 +293,7 @@ def sort_values( "_partitions", max_branch=max_branch, npartitions=len(divisions) - 1, - shuffle="tasks", + shuffle=shuffle, ignore_index=ignore_index, ).drop(columns=["_partitions"]) df3.divisions = (None,) * (df3.npartitions + 1) @@ -297,3 +305,30 @@ def sort_values( df4.divisions = tuple(methods.tolist(divisions)) return df4 + + +def _get_shuffle_type(shuffle): + # Utility to set the shuffle-kwarg default + # and to validate user-specified options + # + # Supported Options: + # - "tasks" + # - "explicit-comms" (requires dask_cuda) + # + shuffle = shuffle or dask.config.get("shuffle", "tasks") + if shuffle not in {"tasks", "explicit-comms"}: + raise ValueError( + f"Dask-cudf only supports in-memory shuffling with " + f"'tasks' or 'explicit-comms'. Got shuffle={shuffle}" + ) + + if shuffle == "explicit-comms": + try: + import dask_cuda # noqa: F401 + except ImportError: + raise ValueError( + "shuffle='explicit-comms' requires dask_cuda. " + "Please install dask_cuda, or use shuffle='tasks'." + ) + + return shuffle diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index e24feaa2ea4..6f7f720f29c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -6,6 +6,7 @@ import dask from dask import dataframe as dd from dask.distributed import Client +from dask.utils_test import hlg_layer from distributed.utils_test import cleanup, loop, loop_in_thread # noqa: F401 import cudf @@ -77,3 +78,55 @@ def test_str_series_roundtrip(): actual = dask_series.compute() assert_eq(actual, expected) + + +def test_shuffle_explicit_comms(): + with dask_cuda.LocalCUDACluster(n_workers=2) as cluster: + with Client(cluster): + df = cudf.DataFrame({"a": [1, 2, 3, 4], "b": [3, 1, 2, 4]}) + ddf = dask_cudf.from_cudf(df, npartitions=4) + + # Test sort_values API + got_ec = ddf.sort_values(["b"], shuffle="explicit-comms") + got_tasks = ddf.sort_values(["b"], shuffle="tasks") + assert hlg_layer(got_ec.dask, "explicit") + assert_eq(got_ec.compute(), got_tasks.compute()) + + # Test set_index API + got_ec = ddf.set_index("b", shuffle="explicit-comms") + got_tasks = ddf.set_index("b", shuffle="tasks") + assert got_ec.divisions == got_tasks.divisions + assert hlg_layer(got_ec.dask, "explicit") + assert_eq(got_ec.compute(), got_tasks.compute()) + + # Test shuffle API + got_ec = ddf.shuffle(["b"], shuffle="explicit-comms") + assert hlg_layer(got_ec.dask, "explicit") + assert len(got_ec) == len(ddf) + + # Test merge API + got_ec = ddf.merge(ddf.copy(), on="b", shuffle="explicit-comms") + got_tasks = ddf.merge(ddf.copy(), on="b", shuffle="tasks") + assert hlg_layer(got_ec.dask, "explicit") + assert_eq(got_ec.compute(), got_tasks.compute()) + + # Test join API + got_ec = ddf.join( + ddf.set_index("b"), + on="b", + lsuffix="_l", + rsuffix="_r", + shuffle="explicit-comms", + ) + got_tasks = ddf.join( + ddf.set_index("b"), + on="b", + lsuffix="_l", + rsuffix="_r", + shuffle="tasks", + ) + assert hlg_layer(got_ec.dask, "explicit") + assert_eq( + got_ec.compute().sort_index(), + got_tasks.compute().sort_index(), + ) diff --git a/python/strings_udf/CMakeLists.txt b/python/strings_udf/CMakeLists.txt index 59d8ae795f2..53d31575363 100644 --- a/python/strings_udf/CMakeLists.txt +++ b/python/strings_udf/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) set(strings_udf_version 22.10.00) diff --git a/python/strings_udf/cpp/CMakeLists.txt b/python/strings_udf/cpp/CMakeLists.txt index d157acfefde..5bbb6ae4791 100644 --- a/python/strings_udf/cpp/CMakeLists.txt +++ b/python/strings_udf/cpp/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -cmake_minimum_required(VERSION 3.20.1) +cmake_minimum_required(VERSION 3.23.1) include(rapids-cmake) include(rapids-cpm)