From 466a90d5c6febff34872395cfcdecceff8513a7f Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 27 Sep 2022 12:21:50 -0400 Subject: [PATCH 1/6] Document that minimum required CMake version is now 3.23.1 (#11751) With rapids-cmake now requiring CMake 3.23.1 update consumers to correctly express this requirement Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Nghia Truong (https://github.com/ttnghia) - GALI PREM SAGAR (https://github.com/galipremsagar) - Ray Douglass (https://github.com/raydouglass) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/11751 --- CONTRIBUTING.md | 2 +- conda/environments/cudf_dev_cuda11.5.yml | 2 +- conda/recipes/cudf/conda_build_config.yaml | 2 +- conda/recipes/cudf_kafka/meta.yaml | 2 +- conda/recipes/libcudf/conda_build_config.yaml | 2 +- conda/recipes/strings_udf/conda_build_config.yaml | 2 +- cpp/CMakeLists.txt | 2 +- cpp/libcudf_kafka/CMakeLists.txt | 2 +- java/src/main/native/CMakeLists.txt | 2 +- python/cudf/CMakeLists.txt | 2 +- python/cudf/pyproject.toml | 2 +- python/strings_udf/CMakeLists.txt | 2 +- python/strings_udf/cpp/CMakeLists.txt | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 38a57caa5f7..6eb621abcc3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -73,7 +73,7 @@ Compilers: * `gcc` version 9.3+ * `nvcc` version 11.5+ -* `cmake` version 3.20.1+ +* `cmake` version 3.23.1+ CUDA/GPU: diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index 973ca731853..2b4cbdec97b 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -14,7 +14,7 @@ dependencies: - clang-tools=11.1.0 - cupy>=9.5.0,<12.0.0a0 - rmm=22.10.* - - cmake>=3.20.1,!=3.23.0 + - cmake>=3.23.1 - cmake_setuptools>=0.1.3 - scikit-build>=0.13.1 - python>=3.8,<3.10 diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml index d9c3f21448f..0027a80f1ec 100644 --- a/conda/recipes/cudf/conda_build_config.yaml +++ b/conda/recipes/cudf/conda_build_config.yaml @@ -8,7 +8,7 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.20.1,!=3.23.0" + - ">=3.23.1" cuda_compiler: - nvcc diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index df9cfa7c3c5..a65373efec3 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -22,7 +22,7 @@ build: requirements: build: - - cmake >=3.20.1,!=3.23.0 + - cmake >=3.23.1 - {{ compiler('c') }} - {{ compiler('cxx') }} - sysroot_{{ target_platform }} {{ sysroot_version }} diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 4cf672997d3..7f5bf219f1f 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -11,7 +11,7 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.20.1,!=3.23.0" + - ">=3.23.1" gtest_version: - "=1.10.0" diff --git a/conda/recipes/strings_udf/conda_build_config.yaml b/conda/recipes/strings_udf/conda_build_config.yaml index d9c3f21448f..0027a80f1ec 100644 --- a/conda/recipes/strings_udf/conda_build_config.yaml +++ b/conda/recipes/strings_udf/conda_build_config.yaml @@ -8,7 +8,7 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.20.1,!=3.23.0" + - ">=3.23.1" cuda_compiler: - nvcc diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 96fc75adcff..c84589af345 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(../fetch_rapids.cmake) include(rapids-cmake) diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 76a012e7c6e..f355fa01c28 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -11,7 +11,7 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. # ============================================================================= -cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(../../fetch_rapids.cmake) include(rapids-cmake) diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 9410f8eacf3..fca797fbccd 100755 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -11,7 +11,7 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. # ============================================================================= -cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) include(../../../../fetch_rapids.cmake) include(rapids-cmake) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 72e1779401f..9762eacbbed 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) set(cudf_version 22.10.00) diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index ab00b37e4fa..52490444dba 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -7,6 +7,6 @@ requires = [ "setuptools", "cython>=0.29,<0.30", "scikit-build>=0.13.1", - "cmake>=3.20.1,!=3.23.0", + "cmake>=3.23.1", "ninja", ] diff --git a/python/strings_udf/CMakeLists.txt b/python/strings_udf/CMakeLists.txt index 59d8ae795f2..53d31575363 100644 --- a/python/strings_udf/CMakeLists.txt +++ b/python/strings_udf/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -cmake_minimum_required(VERSION 3.20.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) set(strings_udf_version 22.10.00) diff --git a/python/strings_udf/cpp/CMakeLists.txt b/python/strings_udf/cpp/CMakeLists.txt index d157acfefde..5bbb6ae4791 100644 --- a/python/strings_udf/cpp/CMakeLists.txt +++ b/python/strings_udf/cpp/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -cmake_minimum_required(VERSION 3.20.1) +cmake_minimum_required(VERSION 3.23.1) include(rapids-cmake) include(rapids-cpm) From d8feede5d6520695a53bc2c54ef69226f7825260 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 27 Sep 2022 14:23:30 -0500 Subject: [PATCH 2/6] Reduce code duplication for `dask` & `distributed` nightly/stable installs (#11565) After https://github.com/dask/dask/issues/9367 was fixed in dask upstream we had to bump the minimum version of dask to 2022.8.0 to correctly fetch nightly(if channel exists) or stable (if `dask/dev` label doesn't exist). Without this fix, conda builds were always picking up `2022.7.1` only and/or there would be a mix of nightly & stable packages in an env. This PR also does some cleanup and makes the `build.sh` script easy to maintain. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Charles Blackmon-Luca (https://github.com/charlesbluca) URL: https://github.com/rapidsai/cudf/pull/11565 --- ci/benchmark/build.sh | 7 +++++-- ci/gpu/build.sh | 10 ++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index e9ea4630133..34fcbb6e104 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -39,6 +39,9 @@ export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" # Dask & Distributed option to install main(nightly) or `conda-forge` packages. export INSTALL_DASK_MAIN=1 +# Dask version to install when `INSTALL_DASK_MAIN=0` +export DASK_STABLE_VERSION="2022.7.1" + function remove_libcudf_kernel_cache_dir { EXITCODE=$? logger "removing kernel cache dir: $LIBCUDF_KERNEL_CACHE_PATH" @@ -82,8 +85,8 @@ if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then gpuci_logger "gpuci_mamba_retry update dask" gpuci_mamba_retry update dask else - gpuci_logger "gpuci_mamba_retry install conda-forge::dask>=2022.7.1 conda-forge::distributed>=2022.7.1 conda-forge::dask-core>=2022.7.1 --force-reinstall" - gpuci_mamba_retry install conda-forge::dask>=2022.7.1 conda-forge::distributed>=2022.7.1 conda-forge::dask-core>=2022.7.1 --force-reinstall + gpuci_logger "gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" + gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall fi # Install the master version of streamz diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 118bdb263af..c2a2287f69a 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -34,6 +34,9 @@ unset GIT_DESCRIBE_TAG # Dask & Distributed option to install main(nightly) or `conda-forge` packages. export INSTALL_DASK_MAIN=1 +# Dask version to install when `INSTALL_DASK_MAIN=0` +export DASK_STABLE_VERSION="2022.7.1" + # ucx-py version export UCX_PY_VERSION='0.28.*' @@ -91,8 +94,8 @@ function install_dask { gpuci_mamba_retry update dask conda list else - gpuci_logger "gpuci_mamba_retry install conda-forge::dask>=2022.7.1 conda-forge::distributed>=2022.7.1 conda-forge::dask-core>=2022.7.1 --force-reinstall" - gpuci_mamba_retry install conda-forge::dask>=2022.7.1 conda-forge::distributed>=2022.7.1 conda-forge::dask-core>=2022.7.1 --force-reinstall + gpuci_logger "gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" + gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall fi # Install the main version of streamz gpuci_logger "Install the main version of streamz" @@ -188,6 +191,9 @@ else # copied by CI from the upstream 11.5 jobs into $CONDA_ARTIFACT_PATH gpuci_logger "Installing cudf, dask-cudf, cudf_kafka, and custreamz" gpuci_mamba_retry install cudf dask-cudf cudf_kafka custreamz -c "${CONDA_BLD_DIR}" -c "${CONDA_ARTIFACT_PATH}" + + gpuci_logger "Check current conda environment" + conda list --show-channel-urls gpuci_logger "GoogleTests" # Run libcudf and libcudf_kafka gtests from libcudf-tests package From 4005a7fa002c8304ffd806a6b6b084af9b774129 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 27 Sep 2022 14:39:10 -0500 Subject: [PATCH 3/6] Build `cudf` locally before building `strings_udf` conda packages in CI (#11785) Authors: - https://github.com/brandon-b-miller Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/11785 --- ci/cpu/build.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index a931546292e..87d378f4d65 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -83,10 +83,17 @@ if [ "$BUILD_LIBCUDF" == '1' ]; then # BUILD_LIBCUDF == 1 means this job is being run on the cpu_build jobs # that is where we must also build the strings_udf package + mkdir -p ${CONDA_BLD_DIR}/strings_udf/work + STRINGS_UDF_BUILD_DIR=${CONDA_BLD_DIR}/strings_udf/work + gpuci_logger "Build conda pkg for cudf (python 3.8), for strings_udf" + gpuci_conda_retry mambabuild --no-build-id --croot ${STRINGS_UDF_BUILD_DIR} -c ${CONDA_BLD_DIR} conda/recipes/cudf ${CONDA_BUILD_ARGS} --python=3.8 + gpuci_logger "Build conda pkg for cudf (python 3.9), for strings_udf" + gpuci_conda_retry mambabuild --no-build-id --croot ${STRINGS_UDF_BUILD_DIR} -c ${CONDA_BLD_DIR} conda/recipes/cudf ${CONDA_BUILD_ARGS} --python=3.9 + gpuci_logger "Build conda pkg for strings_udf (python 3.8)" - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/strings_udf $CONDA_BUILD_ARGS --python=3.8 + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} -c ${STRINGS_UDF_BUILD_DIR} -c ${CONDA_BLD_DIR} conda/recipes/strings_udf $CONDA_BUILD_ARGS --python=3.8 gpuci_logger "Build conda pkg for strings_udf (python 3.9)" - gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} conda/recipes/strings_udf $CONDA_BUILD_ARGS --python=3.9 + gpuci_conda_retry mambabuild --no-build-id --croot ${CONDA_BLD_DIR} -c ${STRINGS_UDF_BUILD_DIR} -c ${CONDA_BLD_DIR} conda/recipes/strings_udf $CONDA_BUILD_ARGS --python=3.9 mkdir -p ${CONDA_BLD_DIR}/libcudf/work cp -r ${CONDA_BLD_DIR}/work/* ${CONDA_BLD_DIR}/libcudf/work From bcf361fda974bda0d28b73ec00d63cc851c77308 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 27 Sep 2022 15:56:42 -0500 Subject: [PATCH 4/6] Expose "explicit-comms" option in shuffle-based dask_cudf functions (#11576) This PR exposes an option to use Dask-CUDA's explicit-comms shuffle for the primary shuffle-based `dask_cudf.DataFrame` methods: `shuffle`, `sort_values`, and `set_index`. Although "explicit-comms" is still experimental, the explicit-shuffle algorithm is known to consistently outperform the "task"-based shuffle. As far as I can tell, it is not currently possible to use an "explicit-comms" shuffle in `dask_cudf` without directly importing the function from Dask-CUDA (@madsbk - please do correct me if I am mistaken). In order to simplify benchmarking, and to utilize the optimized shuffle within high-cardinality groupby code, I propose that we make it easier to access the explicit shuffle. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/cudf/pull/11576 --- python/dask_cudf/dask_cudf/core.py | 49 ++++++++--------- python/dask_cudf/dask_cudf/sorting.py | 39 +++++++++++++- .../dask_cudf/tests/test_distributed.py | 53 +++++++++++++++++++ 3 files changed, 111 insertions(+), 30 deletions(-) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 2641ef68379..0bf39df313a 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -29,6 +29,7 @@ from dask_cudf import sorting from dask_cudf.accessors import ListMethods, StructMethods +from dask_cudf.sorting import _get_shuffle_type DASK_VERSION = LooseVersion(dask.__version__) @@ -133,25 +134,16 @@ def do_apply_rows(df, func, incols, outcols, kwargs): ) @_dask_cudf_nvtx_annotate - def merge(self, other, **kwargs): - if kwargs.pop("shuffle", "tasks") != "tasks": - raise ValueError( - "Dask-cudf only supports task based shuffling, got %s" - % kwargs["shuffle"] - ) + def merge(self, other, shuffle=None, **kwargs): on = kwargs.pop("on", None) if isinstance(on, tuple): on = list(on) - return super().merge(other, on=on, shuffle="tasks", **kwargs) + return super().merge( + other, on=on, shuffle=_get_shuffle_type(shuffle), **kwargs + ) @_dask_cudf_nvtx_annotate - def join(self, other, **kwargs): - if kwargs.pop("shuffle", "tasks") != "tasks": - raise ValueError( - "Dask-cudf only supports task based shuffling, got %s" - % kwargs["shuffle"] - ) - + def join(self, other, shuffle=None, **kwargs): # CuDF doesn't support "right" join yet how = kwargs.pop("how", "left") if how == "right": @@ -160,15 +152,15 @@ def join(self, other, **kwargs): on = kwargs.pop("on", None) if isinstance(on, tuple): on = list(on) - return super().join(other, how=how, on=on, shuffle="tasks", **kwargs) + return super().join( + other, how=how, on=on, shuffle=_get_shuffle_type(shuffle), **kwargs + ) @_dask_cudf_nvtx_annotate - def set_index(self, other, sorted=False, divisions=None, **kwargs): - if kwargs.pop("shuffle", "tasks") != "tasks": - raise ValueError( - "Dask-cudf only supports task based shuffling, got %s" - % kwargs["shuffle"] - ) + def set_index( + self, other, sorted=False, divisions=None, shuffle=None, **kwargs + ): + pre_sorted = sorted del sorted @@ -196,13 +188,13 @@ def set_index(self, other, sorted=False, divisions=None, **kwargs): divisions = None # Use dask_cudf's sort_values - # TODO: Handle `sorted=True` df = self.sort_values( by, max_branch=kwargs.get("max_branch", None), divisions=divisions, set_divisions=True, ignore_index=True, + shuffle=shuffle, ) # Ignore divisions if its a dataframe @@ -229,7 +221,7 @@ def set_index(self, other, sorted=False, divisions=None, **kwargs): return super().set_index( other, sorted=pre_sorted, - shuffle="tasks", + shuffle=_get_shuffle_type(shuffle), divisions=divisions, **kwargs, ) @@ -246,6 +238,7 @@ def sort_values( na_position="last", sort_function=None, sort_function_kwargs=None, + shuffle=None, **kwargs, ): if kwargs: @@ -262,6 +255,7 @@ def sort_values( ignore_index=ignore_index, ascending=ascending, na_position=na_position, + shuffle=shuffle, sort_function=sort_function, sort_function_kwargs=sort_function_kwargs, ) @@ -338,12 +332,11 @@ def repartition(self, *args, **kwargs): return super().repartition(*args, **kwargs) @_dask_cudf_nvtx_annotate - def shuffle(self, *args, **kwargs): + def shuffle(self, *args, shuffle=None, **kwargs): """Wraps dask.dataframe DataFrame.shuffle method""" - shuffle_arg = kwargs.pop("shuffle", None) - if shuffle_arg and shuffle_arg != "tasks": - raise ValueError("dask_cudf does not support disk-based shuffle.") - return super().shuffle(*args, shuffle="tasks", **kwargs) + return super().shuffle( + *args, shuffle=_get_shuffle_type(shuffle), **kwargs + ) @_dask_cudf_nvtx_annotate def groupby(self, by=None, **kwargs): diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index 1c89baba592..a359920229d 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -6,10 +6,10 @@ import numpy as np import tlz as toolz +import dask from dask.base import tokenize from dask.dataframe import methods from dask.dataframe.core import DataFrame, Index, Series -from dask.dataframe.shuffle import rearrange_by_column from dask.highlevelgraph import HighLevelGraph from dask.utils import M @@ -231,10 +231,18 @@ def sort_values( ignore_index=False, ascending=True, na_position="last", + shuffle=None, sort_function=None, sort_function_kwargs=None, ): """Sort by the given list/tuple of column names.""" + + shuffle = _get_shuffle_type(shuffle) + # Note that we cannot import `rearrange_by_column` in + # the header, because we need to allow dask-cuda to + # patch this function before we import it here + from dask.dataframe.shuffle import rearrange_by_column + if not isinstance(ascending, bool): raise ValueError("ascending must be either True or False") if na_position not in ("first", "last"): @@ -285,7 +293,7 @@ def sort_values( "_partitions", max_branch=max_branch, npartitions=len(divisions) - 1, - shuffle="tasks", + shuffle=shuffle, ignore_index=ignore_index, ).drop(columns=["_partitions"]) df3.divisions = (None,) * (df3.npartitions + 1) @@ -297,3 +305,30 @@ def sort_values( df4.divisions = tuple(methods.tolist(divisions)) return df4 + + +def _get_shuffle_type(shuffle): + # Utility to set the shuffle-kwarg default + # and to validate user-specified options + # + # Supported Options: + # - "tasks" + # - "explicit-comms" (requires dask_cuda) + # + shuffle = shuffle or dask.config.get("shuffle", "tasks") + if shuffle not in {"tasks", "explicit-comms"}: + raise ValueError( + f"Dask-cudf only supports in-memory shuffling with " + f"'tasks' or 'explicit-comms'. Got shuffle={shuffle}" + ) + + if shuffle == "explicit-comms": + try: + import dask_cuda # noqa: F401 + except ImportError: + raise ValueError( + "shuffle='explicit-comms' requires dask_cuda. " + "Please install dask_cuda, or use shuffle='tasks'." + ) + + return shuffle diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py index e24feaa2ea4..6f7f720f29c 100644 --- a/python/dask_cudf/dask_cudf/tests/test_distributed.py +++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py @@ -6,6 +6,7 @@ import dask from dask import dataframe as dd from dask.distributed import Client +from dask.utils_test import hlg_layer from distributed.utils_test import cleanup, loop, loop_in_thread # noqa: F401 import cudf @@ -77,3 +78,55 @@ def test_str_series_roundtrip(): actual = dask_series.compute() assert_eq(actual, expected) + + +def test_shuffle_explicit_comms(): + with dask_cuda.LocalCUDACluster(n_workers=2) as cluster: + with Client(cluster): + df = cudf.DataFrame({"a": [1, 2, 3, 4], "b": [3, 1, 2, 4]}) + ddf = dask_cudf.from_cudf(df, npartitions=4) + + # Test sort_values API + got_ec = ddf.sort_values(["b"], shuffle="explicit-comms") + got_tasks = ddf.sort_values(["b"], shuffle="tasks") + assert hlg_layer(got_ec.dask, "explicit") + assert_eq(got_ec.compute(), got_tasks.compute()) + + # Test set_index API + got_ec = ddf.set_index("b", shuffle="explicit-comms") + got_tasks = ddf.set_index("b", shuffle="tasks") + assert got_ec.divisions == got_tasks.divisions + assert hlg_layer(got_ec.dask, "explicit") + assert_eq(got_ec.compute(), got_tasks.compute()) + + # Test shuffle API + got_ec = ddf.shuffle(["b"], shuffle="explicit-comms") + assert hlg_layer(got_ec.dask, "explicit") + assert len(got_ec) == len(ddf) + + # Test merge API + got_ec = ddf.merge(ddf.copy(), on="b", shuffle="explicit-comms") + got_tasks = ddf.merge(ddf.copy(), on="b", shuffle="tasks") + assert hlg_layer(got_ec.dask, "explicit") + assert_eq(got_ec.compute(), got_tasks.compute()) + + # Test join API + got_ec = ddf.join( + ddf.set_index("b"), + on="b", + lsuffix="_l", + rsuffix="_r", + shuffle="explicit-comms", + ) + got_tasks = ddf.join( + ddf.set_index("b"), + on="b", + lsuffix="_l", + rsuffix="_r", + shuffle="tasks", + ) + assert hlg_layer(got_ec.dask, "explicit") + assert_eq( + got_ec.compute().sort_index(), + got_tasks.compute().sort_index(), + ) From 1003e33fba724c00d0eed906a37b8df4b4509b06 Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 27 Sep 2022 16:17:31 -0500 Subject: [PATCH 5/6] Add docs for use of string data to `DataFrame.apply` and `Series.apply` and update guide to UDFs notebook (#11733) This PR updates some docstrings around cuDF to show some examples of how to use strings inside UDFs, as well as provide some caveats. It also adds a section with some detail and examples to our guide to udfs ipython notebook. Authors: - https://github.com/brandon-b-miller Approvers: - Ashwin Srinath (https://github.com/shwina) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/11733 --- .../source/user_guide/guide-to-udfs.ipynb | 343 ++++++++++++++---- python/cudf/cudf/core/dataframe.py | 46 +++ python/cudf/cudf/core/series.py | 43 +++ 3 files changed, 367 insertions(+), 65 deletions(-) diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index ef7500a2be9..32a51224668 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -27,7 +27,7 @@ "source": [ "Like many tabular data processing APIs, cuDF provides a range of composable, DataFrame style operators. While out of the box functions are flexible and useful, it is sometimes necessary to write custom code, or user-defined functions (UDFs), that can be applied to rows, columns, and other groupings of the cells making up the DataFrame.\n", "\n", - "In conjunction with the broader GPU PyData ecosystem, cuDF provides interfaces to run UDFs on a variety of data structures. Currently, we can only execute UDFs on numeric, boolean, datetime, and timedelta typed data (support for strings is being planned). This guide covers writing and executing UDFs on the following data structures:\n", + "In conjunction with the broader GPU PyData ecosystem, cuDF provides interfaces to run UDFs on a variety of data structures. Currently, we can only execute UDFs on numeric, boolean, datetime, and timedelta typed data with partial support for strings in some APIs. This guide covers writing and executing UDFs on the following data structures:\n", "\n", "- Series\n", "- DataFrame\n", @@ -328,6 +328,91 @@ "In addition, `cudf.NA` can be returned from a function directly or conditionally. This capability should allow you to implement custom null handling in a wide variety of cases." ] }, + { + "cell_type": "markdown", + "id": "cc7c7e67", + "metadata": {}, + "source": [ + "### String data" + ] + }, + { + "cell_type": "markdown", + "id": "c0980218", + "metadata": {}, + "source": [ + "Support for a subset of string functionality is available for `apply` through the [strings_udf](https://anaconda.org/rapidsai-nightly/strings_udf) package, which is separately installable. Currently, the following string operations are provided through `strings_udf`:\",\n", + "- `str.count`\n", + "- `str.startswith`\n", + "- `str.endswith`\n", + "- `str.find`\n", + "- `str.rfind`\n", + "- `str.isalnum`\n", + "- `str.isdecimal`\n", + "- `str.isdigit`\n", + "- `str.islower`\n", + "- `str.isupper`\n", + "- `str.isalpha`\n", + "- `str.istitle`\n", + "- `str.isspace`\n", + "- `==`, `!=`, `>=`, `<=`, `>`, `<` (between two strings)\n", + "- `len` (e.g. `len(some_string))`\n", + "- `__contains__` (e.g, `'abc' in some_string`)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d7d1abd7", + "metadata": {}, + "outputs": [], + "source": [ + "sr = cudf.Series(['', 'abc', 'some_example'])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e8538ba0", + "metadata": {}, + "outputs": [], + "source": [ + "def f(st):\n", + " if len(st) > 0:\n", + " if st.startswith('a'):\n", + " return 1\n", + " elif 'example' in st:\n", + " return 2\n", + " else:\n", + " return -1\n", + " else:\n", + " return 42" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "23524fd8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 42\n", + "1 1\n", + "2 2\n", + "dtype: int64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sr.apply(f)" + ] + }, { "cell_type": "markdown", "id": "54cafbc0", @@ -349,7 +434,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "id": "732434f6", "metadata": {}, "outputs": [], @@ -359,7 +444,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "id": "4f5997e5", "metadata": {}, "outputs": [], @@ -385,7 +470,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "id": "ea6008a6", "metadata": {}, "outputs": [], @@ -405,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "id": "183a82ed", "metadata": {}, "outputs": [ @@ -485,7 +570,7 @@ "4 979 982 1011 9790.0" ] }, - "execution_count": 15, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -534,7 +619,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "id": "73653918", "metadata": {}, "outputs": [], @@ -553,7 +638,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "id": "077feb75", "metadata": {}, "outputs": [ @@ -609,7 +694,7 @@ "2 3 6" ] }, - "execution_count": 17, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -632,7 +717,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "id": "091e39e1", "metadata": {}, "outputs": [ @@ -645,7 +730,7 @@ "dtype: int64" ] }, - "execution_count": 18, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -664,7 +749,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "id": "bd345fab", "metadata": {}, "outputs": [ @@ -677,7 +762,7 @@ "dtype: object" ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -704,7 +789,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "id": "b70f4b3b", "metadata": {}, "outputs": [ @@ -756,7 +841,7 @@ "2 3" ] }, - "execution_count": 20, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -775,7 +860,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "id": "0313c8df", "metadata": {}, "outputs": [ @@ -788,7 +873,7 @@ "dtype: int64" ] }, - "execution_count": 21, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -807,7 +892,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "id": "96a7952a", "metadata": {}, "outputs": [ @@ -863,7 +948,7 @@ "2 3 1" ] }, - "execution_count": 22, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -886,7 +971,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "id": "e0815f60", "metadata": {}, "outputs": [ @@ -899,7 +984,7 @@ "dtype: int64" ] }, - "execution_count": 23, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -918,7 +1003,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 27, "id": "495efd14", "metadata": {}, "outputs": [ @@ -974,7 +1059,7 @@ "2 3 3.14" ] }, - "execution_count": 24, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -992,7 +1077,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "id": "678b0b5a", "metadata": {}, "outputs": [ @@ -1005,7 +1090,7 @@ "dtype: float64" ] }, - "execution_count": 25, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1037,7 +1122,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 29, "id": "acf48d56", "metadata": {}, "outputs": [ @@ -1089,7 +1174,7 @@ "2 5" ] }, - "execution_count": 26, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1110,7 +1195,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 30, "id": "78a98172", "metadata": {}, "outputs": [ @@ -1123,7 +1208,7 @@ "dtype: float64" ] }, - "execution_count": 27, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1142,7 +1227,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 31, "id": "142c30a9", "metadata": {}, "outputs": [ @@ -1210,7 +1295,7 @@ "2 3 6 4 8 6" ] }, - "execution_count": 28, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1231,7 +1316,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 32, "id": "fee9198a", "metadata": {}, "outputs": [ @@ -1244,7 +1329,7 @@ "dtype: float64" ] }, - "execution_count": 29, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1253,6 +1338,134 @@ "df.apply(f, axis=1)" ] }, + { + "cell_type": "markdown", + "id": "71e30d33", + "metadata": {}, + "source": [ + "### String Data" + ] + }, + { + "cell_type": "markdown", + "id": "1a3694ea", + "metadata": {}, + "source": [ + "String data may be used inside `DataFrame.apply` UDFs, subject to the same constraints as those for `Series.apply`. See the section on string handling for `Series` UDFs above for details. Below is a simple example extending the row UDF logic from above in the case of a string column:" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "cccd59f7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
str_colscale
0abc1
1ABC2
2Example3
\n", + "
" + ], + "text/plain": [ + " str_col scale\n", + "0 abc 1\n", + "1 ABC 2\n", + "2 Example 3" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str_df = cudf.DataFrame({\n", + " 'str_col': ['abc', 'ABC', 'Example'],\n", + " 'scale': [1, 2, 3]\n", + "})\n", + "str_df" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "35737fd9", + "metadata": {}, + "outputs": [], + "source": [ + "def f(row):\n", + " st = row['str_col']\n", + " scale = row['scale']\n", + " \n", + " if len(st) > 5:\n", + " return len(st) + scale\n", + " else:\n", + " return len(st)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "4ede4d5b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 3\n", + "1 3\n", + "2 10\n", + "dtype: int64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str_df.apply(f, axis=1)" + ] + }, { "cell_type": "markdown", "id": "9c587bd2", @@ -1273,7 +1486,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 36, "id": "90cbcd85", "metadata": {}, "outputs": [], @@ -1304,7 +1517,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 37, "id": "e782daff", "metadata": {}, "outputs": [ @@ -1376,7 +1589,7 @@ "2 3 6 4 8 6 9.0" ] }, - "execution_count": 31, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -1410,7 +1623,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 38, "id": "befd8333", "metadata": {}, "outputs": [ @@ -1484,7 +1697,7 @@ "4 979 982 1011" ] }, - "execution_count": 32, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -1511,7 +1724,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 39, "id": "d1f3dcaf", "metadata": {}, "outputs": [ @@ -1591,7 +1804,7 @@ "4 979 982 1011 1961.0" ] }, - "execution_count": 33, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -1626,7 +1839,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 40, "id": "6bc6aea3", "metadata": {}, "outputs": [ @@ -1642,7 +1855,7 @@ "dtype: float64" ] }, - "execution_count": 34, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -1654,7 +1867,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 41, "id": "a4c31df1", "metadata": {}, "outputs": [ @@ -1664,7 +1877,7 @@ "Rolling [window=3,min_periods=3,center=False]" ] }, - "execution_count": 35, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -1684,7 +1897,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 42, "id": "eb5a081b", "metadata": {}, "outputs": [], @@ -1710,7 +1923,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 43, "id": "ddec3263", "metadata": {}, "outputs": [ @@ -1726,7 +1939,7 @@ "dtype: float64" ] }, - "execution_count": 37, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -1745,7 +1958,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 44, "id": "8b61094a", "metadata": {}, "outputs": [ @@ -1813,7 +2026,7 @@ "4 59.0 59.0" ] }, - "execution_count": 38, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -1827,7 +2040,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 45, "id": "bb8c3019", "metadata": {}, "outputs": [ @@ -1925,7 +2138,7 @@ "9 100.0 100.0" ] }, - "execution_count": 39, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -1949,7 +2162,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 46, "id": "3dc272ab", "metadata": {}, "outputs": [ @@ -2029,7 +2242,7 @@ "4 -0.970850 False Sarah 0.342905" ] }, - "execution_count": 40, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -2041,7 +2254,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 47, "id": "c0578e0a", "metadata": {}, "outputs": [], @@ -2059,7 +2272,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 48, "id": "19f0f7fe", "metadata": {}, "outputs": [], @@ -2088,7 +2301,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 49, "id": "c43426c3", "metadata": {}, "outputs": [ @@ -2219,7 +2432,7 @@ "9 -0.725581 True George 0.405245 0.271319" ] }, - "execution_count": 43, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -2251,7 +2464,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 50, "id": "aa6a8509", "metadata": {}, "outputs": [ @@ -2261,7 +2474,7 @@ "array([ 1., 2., 3., 4., 10.])" ] }, - "execution_count": 44, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -2284,7 +2497,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 51, "id": "0bb8bf93", "metadata": {}, "outputs": [ @@ -2299,7 +2512,7 @@ "dtype: int32" ] }, - "execution_count": 45, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -2326,7 +2539,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 52, "id": "ce60b639", "metadata": {}, "outputs": [ @@ -2336,7 +2549,7 @@ "array([ 5., 10., 15., 20., 50.])" ] }, - "execution_count": 46, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -2360,7 +2573,7 @@ "id": "fe7eb68b", "metadata": {}, "source": [ - "- Only numeric nondecimal scalar types are currently supported as of yet, but strings and structured types are in planning. Attempting to use this API with those types will throw a `TypeError`.\n", + "- UDFs are currently only supported for numeric nondecimal scalar types (full support) and strings in `Series.apply` and `DataFrame.apply` (partial support, subject to the caveats outlined above). Attempting to use this API with unsupported types will raise a `TypeError`.\n", "- We do not yet fully support all arithmetic operators. Certain ops like bitwise operations are not currently implemented, but planned in future releases. If an operator is needed, a github issue should be raised so that it can be properly prioritized and implemented." ] }, @@ -2402,7 +2615,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.13" } }, "nbformat": 4, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index cac37f1f274..f00c7d1f2b5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3955,6 +3955,12 @@ def apply( For more information, see the `cuDF guide to user defined functions `__. + Support for use of string data within UDFs is provided through the + `strings_udf `__ + RAPIDS library. Supported operations on strings include the subset of + functions and string methods that expect an input string but do not + return a string. Refer to caveats in the UDF guide referenced above. + Parameters ---------- func : function @@ -4078,6 +4084,46 @@ def apply( 1 4.8 2 5.0 dtype: float64 + + UDFs manipulating string data are allowed, as long as + they neither modify strings in place nor create new strings. + For example, the following UDF is allowed: + + >>> def f(row): + ... st = row['str_col'] + ... scale = row['scale'] + ... if len(st) == 0: + ... return -1 + ... elif st.startswith('a'): + ... return 1 - scale + ... elif 'example' in st: + ... return 1 + scale + ... else: + ... return 42 + ... + >>> df = cudf.DataFrame({ + ... 'str_col': ['', 'abc', 'some_example'], + ... 'scale': [1, 2, 3] + ... }) + >>> df.apply(f, axis=1) # doctest: +SKIP + 0 -1 + 1 -1 + 2 4 + dtype: int64 + + However, the following UDF is not allowed since it includes an + operation that requires the creation of a new string: a call to the + ``upper`` method. Methods that are not supported in this manner + will raise an ``AttributeError``. + + >>> def f(row): + ... st = row['str_col'].upper() + ... return 'ABC' in st + >>> df.apply(f, axis=1) # doctest: +SKIP + + For a complete list of supported functions and methods that may be + used to manipulate string data, see the the UDF guide, + """ if axis != 1: raise ValueError( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e94ca8d653d..f11052096e3 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2251,6 +2251,12 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): For more information, see the `cuDF guide to user defined functions `__. + Support for use of string data within UDFs is provided through the + `strings_udf `__ + RAPIDS library. Supported operations on strings include the subset of + functions and string methods that expect an input string but do not + return a string. Refer to caveats in the UDF guide referenced above. + Parameters ---------- func : function @@ -2332,6 +2338,43 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): 1 2 4.5 dtype: float64 + + UDFs manipulating string data are allowed, as long as + they neither modify strings in place nor create new strings. + For example, the following UDF is allowed: + + >>> def f(st): + ... if len(st) == 0: + ... return -1 + ... elif st.startswith('a'): + ... return 1 + ... elif 'example' in st: + ... return 2 + ... else: + ... return 3 + ... + >>> sr = cudf.Series(['', 'abc', 'some_example']) + >>> sr.apply(f) # doctest: +SKIP + 0 -1 + 1 1 + 2 2 + dtype: int64 + + However, the following UDF is not allowed since it includes an + operation that requires the creation of a new string: a call to the + ``upper`` method. Methods that are not supported in this manner + will raise an ``AttributeError``. + + >>> def f(st): + ... new = st.upper() + ... return 'ABC' in new + ... + >>> sr.apply(f) # doctest: +SKIP + + For a complete list of supported functions and methods that may be + used to manipulate string data, see the the UDF guide, + + """ if convert_dtype is not True: raise ValueError("Series.apply only supports convert_dtype=True") From 5a416a02feca89da1e9f32e30a7fc501d1a2d34d Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Tue, 27 Sep 2022 16:27:10 -0500 Subject: [PATCH 6/6] Fix an issue in cudf::row_bit_count involving structs and lists at multiple levels. (#11779) `row_bit_count` keeps track of a stack of "branches" which represent a span of rows to be included in the computed size. As you traverse through a hierarchy of lists, that span of rows is maintained as a stack. The code that was handling jumping out from the bottom of a stack to a new column was making the faulty assumption that the jump was only 1 level up. Authors: - https://github.com/nvdbaranec Approvers: - Nghia Truong (https://github.com/ttnghia) - Mike Wilson (https://github.com/hyperbolic2346) - Alessandro Bellina (https://github.com/abellina) URL: https://github.com/rapidsai/cudf/pull/11779 --- cpp/src/transform/row_bit_count.cu | 10 +++--- cpp/tests/transform/row_bit_count_test.cu | 38 +++++++++++++++++++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu index e60e9f95440..9545b5289f9 100644 --- a/cpp/src/transform/row_bit_count.cu +++ b/cpp/src/transform/row_bit_count.cu @@ -409,7 +409,8 @@ __global__ void compute_row_sizes(device_span cols, auto const num_rows = output.size(); if (tid >= num_rows) { return; } - // branch stack. points to the last list prior to branching. + // my_branch_stack points to the last span prior to branching. a branch occurs only + // when we are inside of a list contained within a struct column. row_span* my_branch_stack = thread_branch_stacks + (threadIdx.x * max_branch_depth); size_type branch_depth{0}; @@ -424,11 +425,12 @@ __global__ void compute_row_sizes(device_span cols, for (size_type idx = 0; idx < cols.size(); idx++) { column_device_view const& col = cols[idx]; - // if we've returned from a branch + // if we've returned from a branch, pop to the proper span if (info[idx].branch_depth_start < last_branch_depth) { - cur_span = my_branch_stack[--branch_depth]; + branch_depth = info[idx].branch_depth_start; + cur_span = my_branch_stack[branch_depth]; } - // if we're entering a new branch. + // if we're entering a new branch, push the current span // NOTE: this case can happen (a pop and a push by the same column) // when we have a struct if (info[idx].branch_depth_end > info[idx].branch_depth_start) { diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu index 61c2fa12895..8151e0d6d8d 100644 --- a/cpp/tests/transform/row_bit_count_test.cu +++ b/cpp/tests/transform/row_bit_count_test.cu @@ -618,9 +618,47 @@ TEST_F(RowBitCount, Table) thrust::make_counting_iterator(0) + t.num_rows(), mcv.begin(), sum_functor{cv0.data(), cv1.data(), cv2.data()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result); } +TEST_F(RowBitCount, DepthJump) +{ + // jump more than 1 branch depth. + + using T = int; + + // struct>, int> + // the jump occurs from depth 2 (the leafmost int column) + // to depth 0 (the topmost int column) + cudf::test::fixed_width_column_wrapper ____c0{1, 2, 3, 5, 5, 6, 7, 8}; + cudf::test::fixed_width_column_wrapper ___offsets{0, 2, 4, 6, 8}; + auto ___c0 = cudf::make_lists_column(4, ___offsets.release(), ____c0.release(), 0, {}); + std::vector> __children; + __children.push_back(std::move(___c0)); + cudf::test::structs_column_wrapper __c0(std::move(__children)); + cudf::test::fixed_width_column_wrapper _offsets{0, 3, 4}; + auto _c0 = cudf::make_lists_column(2, _offsets.release(), __c0.release(), 0, {}); + cudf::test::fixed_width_column_wrapper _c1{3, 4}; + std::vector> children; + children.push_back(std::move(_c0)); + children.push_back(_c1.release()); + cudf::test::structs_column_wrapper c0(std::move(children)); + + table_view t({c0}); + auto result = cudf::row_bit_count(t); + + // expected size = (num rows at level 1 + num_rows at level 2) + (# values the leaf int column) + + // 1 (value in topmost int column) + constexpr size_type offset_size = sizeof(cudf::offset_type) * CHAR_BIT; + constexpr size_type type_size = sizeof(T) * CHAR_BIT; + cudf::test::fixed_width_column_wrapper expected{ + ((1 + 3) * offset_size) + (6 * type_size) + (1 * type_size), + ((1 + 1) * offset_size) + (2 * type_size) + (1 * type_size)}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + TEST_F(RowBitCount, SlicedColumnsFixedWidth) { auto const slice_size = 7;