From f7c35d56cdfb7af842b54255029b7481ca9b6d94 Mon Sep 17 00:00:00 2001 From: martinfalisse <45781926+martinfalisse@users.noreply.github.com> Date: Thu, 14 Apr 2022 20:27:51 +0200 Subject: [PATCH 1/7] Add support for numeric_only in DataFrame._reduce (#10629) Add support for numeric_only in DataFrame._reduce, this way can use df.mean(numeric_only=True), etc. Resolves https://github.com/rapidsai/cudf/issues/2067. Also partially addresses https://github.com/rapidsai/cudf/issues/9009. Authors: - https://github.com/martinfalisse Approvers: - Michael Wang (https://github.com/isVoid) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10629 --- python/cudf/cudf/core/dataframe.py | 25 +++--- python/cudf/cudf/core/single_column_frame.py | 4 +- python/cudf/cudf/tests/test_dataframe.py | 54 +++++++++++++ python/cudf/cudf/tests/test_stats.py | 83 +++++++++++++++++--- 4 files changed, 145 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 2b2c09fa2a0..ae60cd91fac 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5180,26 +5180,33 @@ def _reduce( if level is not None: raise NotImplementedError("level parameter is not implemented yet") - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" + source = self + if numeric_only: + numeric_cols = ( + name + for name in self._data.names + if is_numeric_dtype(self._data[name]) ) - axis = self._get_axis_from_axis_arg(axis) + source = self._get_columns_by_label(numeric_cols) + if source.empty: + return Series(index=cudf.StringIndex([])) + + axis = source._get_axis_from_axis_arg(axis) if axis == 0: try: result = [ - getattr(self._data[col], op)(**kwargs) - for col in self._data.names + getattr(source._data[col], op)(**kwargs) + for col in source._data.names ] except AttributeError: - raise TypeError(f"cannot perform {op} with 
type {self.dtype}") + raise TypeError(f"Not all column dtypes support op {op}") return Series._from_data( - {None: result}, as_index(self._data.names) + {None: result}, as_index(source._data.names) ) elif axis == 1: - return self._apply_cupy_method_axis_1(op, **kwargs) + return source._apply_cupy_method_axis_1(op, **kwargs) @_cudf_nvtx_annotate def _scan( diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 003f8ea7fdb..addc823e7f1 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -52,9 +52,9 @@ def _reduce( if level is not None: raise NotImplementedError("level parameter is not implemented yet") - if numeric_only not in (None, True): + if numeric_only: raise NotImplementedError( - "numeric_only parameter is not implemented yet" + f"Series.{op} does not implement numeric_only" ) try: return getattr(self._column, op)(**kwargs) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index a7fad792bd0..13ab0b35822 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9230,3 +9230,57 @@ def test_dataframe_pct_change(data, periods, fill_method): expected = pdf.pct_change(periods=periods, fill_method=fill_method) assert_eq(expected, actual) + + +def test_mean_timeseries(): + gdf = cudf.datasets.timeseries() + pdf = gdf.to_pandas() + + expected = pdf.mean(numeric_only=True) + actual = gdf.mean(numeric_only=True) + + assert_eq(expected, actual) + + with pytest.raises(TypeError): + gdf.mean() + + +@pytest.mark.parametrize( + "data", + [ + { + "a": [1, 2, 3, 4, 5], + "b": ["a", "b", "c", "d", "e"], + "c": [1.0, 2.0, 3.0, 4.0, 5.0], + } + ], +) +def test_std_different_dtypes(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + expected = pdf.std(numeric_only=True) + actual = gdf.std(numeric_only=True) + + assert_eq(expected, actual) + + with 
pytest.raises(TypeError): + gdf.std() + + +@pytest.mark.parametrize( + "data", + [ + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": ["v", "n", "k", "l", "m", "i", "y", "r", "w"], + "val2": ["d", "d", "d", "e", "e", "e", "f", "f", "f"], + } + ], +) +def test_empty_numeric_only(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + expected = pdf.prod(numeric_only=True) + actual = gdf.prod(numeric_only=True) + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index 977a01952db..08f662f0ba7 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -239,13 +239,10 @@ def test_misc_quantiles(data, q): cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), cudf.Series([]), cudf.Series([-3]), - randomdata( - nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} - ), ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_kurtosis(data, null_flag): +def test_kurtosis_series(data, null_flag): pdata = data.to_pandas() if null_flag and len(data) > 2: @@ -262,8 +259,13 @@ def test_kurtosis(data, null_flag): expected = pdata.kurt() np.testing.assert_array_almost_equal(got, expected) + got = data.kurt(numeric_only=False) + got = got if np.isscalar(got) else got.to_numpy() + expected = pdata.kurt(numeric_only=False) + np.testing.assert_array_almost_equal(got, expected) + with pytest.raises(NotImplementedError): - data.kurt(numeric_only=False) + data.kurt(numeric_only=True) @pytest.mark.parametrize( @@ -280,13 +282,10 @@ def test_kurtosis(data, null_flag): cudf.Series([1.1032, 2.32, 43.4, 13, -312.0], index=[0, 4, 3, 19, 6]), cudf.Series([]), cudf.Series([-3]), - randomdata( - nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} - ), ], ) @pytest.mark.parametrize("null_flag", [False, True]) -def test_skew(data, null_flag): +def test_skew_series(data, null_flag): pdata = data.to_pandas() if null_flag and 
len(data) > 2: @@ -298,8 +297,13 @@ def test_skew(data, null_flag): got = got if np.isscalar(got) else got.to_numpy() np.testing.assert_array_almost_equal(got, expected) + got = data.skew(numeric_only=False) + expected = pdata.skew(numeric_only=False) + got = got if np.isscalar(got) else got.to_numpy() + np.testing.assert_array_almost_equal(got, expected) + with pytest.raises(NotImplementedError): - data.skew(numeric_only=False) + data.skew(numeric_only=True) @pytest.mark.parametrize("dtype", params_dtypes) @@ -541,3 +545,62 @@ def test_cov_corr_invalid_dtypes(gsr): rfunc_args_and_kwargs=([gsr],), compare_error_message=False, ) + + +@pytest.mark.parametrize( + "data", + [ + randomdata( + nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} + ), + ], +) +@pytest.mark.parametrize("null_flag", [False, True]) +def test_kurtosis_df(data, null_flag): + pdata = data.to_pandas() + + if null_flag and len(data) > 2: + data.iloc[[0, 2]] = None + pdata.iloc[[0, 2]] = None + + got = data.kurtosis() + got = got if np.isscalar(got) else got.to_numpy() + expected = pdata.kurtosis() + np.testing.assert_array_almost_equal(got, expected) + + got = data.kurt() + got = got if np.isscalar(got) else got.to_numpy() + expected = pdata.kurt() + np.testing.assert_array_almost_equal(got, expected) + + got = data.kurt(numeric_only=True) + got = got if np.isscalar(got) else got.to_numpy() + expected = pdata.kurt(numeric_only=True) + np.testing.assert_array_almost_equal(got, expected) + + +@pytest.mark.parametrize( + "data", + [ + randomdata( + nrows=1000, dtypes={"a": float, "b": int, "c": float, "d": str} + ), + ], +) +@pytest.mark.parametrize("null_flag", [False, True]) +def test_skew_df(data, null_flag): + pdata = data.to_pandas() + + if null_flag and len(data) > 2: + data.iloc[[0, 2]] = None + pdata.iloc[[0, 2]] = None + + got = data.skew() + expected = pdata.skew() + got = got if np.isscalar(got) else got.to_numpy() + np.testing.assert_array_almost_equal(got, expected) + + got = 
data.skew(numeric_only=True) + expected = pdata.skew(numeric_only=True) + got = got if np.isscalar(got) else got.to_numpy() + np.testing.assert_array_almost_equal(got, expected) From 77fa49eddf1c961277ec5e0fb3616433f2a46ea4 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 14 Apr 2022 14:13:06 -0700 Subject: [PATCH 2/7] Clean up C++ includes to use <> instead of "". (#10658) This PR cleans up some C++ includes to use `#include <...>` instead of `#include "..."` where appropriate. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/10658 --- cpp/benchmarks/io/orc/orc_writer.cpp | 2 +- cpp/benchmarks/sort/rank.cpp | 2 +- cpp/benchmarks/string/convert_durations.cpp | 15 +++++++-------- cpp/include/cudf/detail/reduction_functions.hpp | 2 +- cpp/libcudf_kafka/src/kafka_callback.cpp | 2 +- cpp/libcudf_kafka/src/kafka_consumer.cpp | 2 +- cpp/src/merge/merge.cu | 2 +- cpp/src/structs/structs_column_view.cpp | 4 ++-- .../binaryop/binop-compiled-fixed_point-test.cpp | 2 +- cpp/tests/hash_map/map_test.cu | 2 +- cpp/tests/iterator/value_iterator_test_strings.cu | 10 ++++++---- cpp/tests/partitioning/partition_test.cpp | 10 +++++----- 12 files changed, 28 insertions(+), 27 deletions(-) diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index 525c13af5c0..f61dac7677b 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include "cudf/io/types.hpp" #include #include @@ -23,6 +22,7 @@ #include #include +#include // to enable, run cmake with -DBUILD_BENCHMARKS=ON diff --git a/cpp/benchmarks/sort/rank.cpp b/cpp/benchmarks/sort/rank.cpp index 22acb241f0b..c3c77ebd52f 100644 --- a/cpp/benchmarks/sort/rank.cpp +++ b/cpp/benchmarks/sort/rank.cpp @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "cudf/column/column_view.hpp" +#include #include #include diff --git a/cpp/benchmarks/string/convert_durations.cpp b/cpp/benchmarks/string/convert_durations.cpp index dc9a1e991b2..8af111d9a63 100644 --- a/cpp/benchmarks/string/convert_durations.cpp +++ b/cpp/benchmarks/string/convert_durations.cpp @@ -13,25 +13,24 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#include - +#include #include #include +#include #include #include #include #include +#include + +#include +#include + #include #include -#include "../fixture/benchmark_fixture.hpp" -#include "../synchronization/synchronization.hpp" -#include "cudf/column/column_view.hpp" -#include "cudf/wrappers/durations.hpp" - class DurationsToString : public cudf::benchmark { }; template diff --git a/cpp/include/cudf/detail/reduction_functions.hpp b/cpp/include/cudf/detail/reduction_functions.hpp index 3a6113e66ce..317e4d0cf47 100644 --- a/cpp/include/cudf/detail/reduction_functions.hpp +++ b/cpp/include/cudf/detail/reduction_functions.hpp @@ -17,9 +17,9 @@ #pragma once #include +#include #include -#include "cudf/lists/lists_column_view.hpp" #include namespace cudf { diff --git a/cpp/libcudf_kafka/src/kafka_callback.cpp b/cpp/libcudf_kafka/src/kafka_callback.cpp index 6b98747c145..79a40640627 100644 --- a/cpp/libcudf_kafka/src/kafka_callback.cpp +++ b/cpp/libcudf_kafka/src/kafka_callback.cpp @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "cudf_kafka/kafka_callback.hpp" +#include #include diff --git a/cpp/libcudf_kafka/src/kafka_consumer.cpp b/cpp/libcudf_kafka/src/kafka_consumer.cpp index 49e89a56e60..2ddaa9892da 100644 --- a/cpp/libcudf_kafka/src/kafka_consumer.cpp +++ b/cpp/libcudf_kafka/src/kafka_consumer.cpp @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "cudf_kafka/kafka_consumer.hpp" +#include #include diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu index 01a94457b69..9c94a6220d6 100644 --- a/cpp/src/merge/merge.cu +++ b/cpp/src/merge/merge.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -38,7 +39,6 @@ #include #include -#include "cudf/utilities/traits.hpp" #include #include diff --git a/cpp/src/structs/structs_column_view.cpp b/cpp/src/structs/structs_column_view.cpp index db9496f18be..681f13386ff 100644 --- a/cpp/src/structs/structs_column_view.cpp +++ b/cpp/src/structs/structs_column_view.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "cudf/utilities/error.hpp" #include #include +#include namespace cudf { diff --git a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp index 64462669f90..28df893aff1 100644 --- a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp @@ -20,13 +20,13 @@ #include #include #include +#include #include #include #include #include -#include "cudf/utilities/error.hpp" #include #include diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu index d69aee57756..f42549514e6 100644 --- a/cpp/tests/hash_map/map_test.cu +++ b/cpp/tests/hash_map/map_test.cu @@ -23,12 +23,12 @@ #include #include +#include #include #include #include -#include "rmm/exec_policy.hpp" #include #include #include diff --git a/cpp/tests/iterator/value_iterator_test_strings.cu b/cpp/tests/iterator/value_iterator_test_strings.cu index 5bddbfbd4aa..9aa18eb844f 100644 --- a/cpp/tests/iterator/value_iterator_test_strings.cu +++ 
b/cpp/tests/iterator/value_iterator_test_strings.cu @@ -12,10 +12,12 @@ * or implied. See the License for the specific language governing permissions and limitations under * the License. */ -#include "cudf/detail/utilities/vector_factories.hpp" -#include "rmm/cuda_stream_view.hpp" -#include "rmm/device_uvector.hpp" -#include +#include "iterator_tests.cuh" + +#include + +#include +#include #include #include diff --git a/cpp/tests/partitioning/partition_test.cpp b/cpp/tests/partitioning/partition_test.cpp index 785af409c4c..014a19e93a9 100644 --- a/cpp/tests/partitioning/partition_test.cpp +++ b/cpp/tests/partitioning/partition_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,16 +13,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include -#include #include #include #include #include #include -#include "cudf/sorting.hpp" +#include +#include +#include +#include template class PartitionTest : public cudf::test::BaseFixture { From 14a32619a5b1c0eff49588b141f8ef2eb754cadf Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 14 Apr 2022 14:40:20 -0700 Subject: [PATCH 3/7] Improve User Guide docs (#10663) This PR makes some minor improvements to the cuDF user guide and some docstrings. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/10663 --- docs/cudf/source/basics/basics.rst | 58 ++++++++++--------- docs/cudf/source/basics/internals.rst | 4 +- .../cudf/source/basics/io-gds-integration.rst | 24 ++++---- .../source/basics/io-nvcomp-integration.rst | 4 +- python/cudf/cudf/core/cut.py | 46 ++++++++++----- python/cudf/cudf/core/groupby/groupby.py | 21 +++---- python/cudf/cudf/core/single_column_frame.py | 4 +- 7 files changed, 91 insertions(+), 70 deletions(-) diff --git a/docs/cudf/source/basics/basics.rst b/docs/cudf/source/basics/basics.rst index 60a65558033..9b8983fba49 100644 --- a/docs/cudf/source/basics/basics.rst +++ b/docs/cudf/source/basics/basics.rst @@ -15,36 +15,40 @@ The following table lists all of cudf types. For methods requiring dtype argumen .. rst-class:: special-table .. table:: - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Kind of Data | Data Type | Scalar | String Aliases | - +========================+==================+=====================================================================================+=============================================+ - | Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, np.uint16_, | ``'int8'``, ``'int16'``, ``'int32'``, | - | | | np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | - | | | | ``'uint32'``, ``'uint64'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Float | | np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | - 
+------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Strings | | `str `_ | ``'string'``, ``'object'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | - | | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``,| - | (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'``| - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Categorical | CategoricalDtype | (none) | ``'category'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Boolean | | np.bool_ | ``'bool'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Decimal | Decimal32Dtype, | (none) | (none) | - | | Decimal64Dtype, | | | - | | Decimal128Dtype | | | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Kind of Data | Data Type | Scalar 
| String Aliases | + +=================+==================+==============================================================+==============================================+ + | Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, | ``'int8'``, ``'int16'``, ``'int32'``, | + | | | np.uint16_, np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | + | | | | ``'uint32'``, ``'uint64'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Float | | np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Strings | | `str `_ | ``'string'``, ``'object'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | + | | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``, | + | (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Categorical | CategoricalDtype | (none) | ``'category'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Boolean | | np.bool_ | ``'bool'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Decimal | 
Decimal32Dtype, | (none) | (none) | + | | Decimal64Dtype, | | | + | | Decimal128Dtype | | | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Lists | ListDtype | list | ``'list'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Structs | StructDtype | dict | ``'struct'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ **Note: All dtypes above are Nullable** -.. _np.int8: -.. _np.int16: +.. _np.int8: +.. _np.int16: .. _np.int32: .. _np.int64: .. _np.uint8: diff --git a/docs/cudf/source/basics/internals.rst b/docs/cudf/source/basics/internals.rst index 60b63c6fab8..96ef40d51e6 100644 --- a/docs/cudf/source/basics/internals.rst +++ b/docs/cudf/source/basics/internals.rst @@ -54,7 +54,7 @@ As another example, the ``StringColumn`` backing the Series 2. No mask buffer as there are no nulls in the Series 3. Two children columns: - - A column of 8-bit characters + - A column of UTF-8 characters ``['d', 'o', 'y', 'o', 'u', h' ... 
'?']`` - A column of "offsets" to the characters column (in this case, ``[0, 2, 5, 9, 12, 19]``) @@ -172,7 +172,7 @@ Selecting columns by index: >>> ca.select_by_index(1) ColumnAccessor(OrderedColumnDict([('y', )]), multiindex=False, level_names=(None,)) >>> ca.select_by_index([0, 1]) - ColumnAccessor(OrderedColumnDict([('x', ), ('y', )]), multiindex=False, level_names=(None,)) + ColumnAccessor(OrderedColumnDict([('x', ), ('y', )]), multiindex=False, level_names=(None,)) >>> ca.select_by_index(slice(1, 3)) ColumnAccessor(OrderedColumnDict([('y', ), ('z', )]), multiindex=False, level_names=(None,)) diff --git a/docs/cudf/source/basics/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst index 71c114e9149..5ff07ac29c5 100644 --- a/docs/cudf/source/basics/io-gds-integration.rst +++ b/docs/cudf/source/basics/io-gds-integration.rst @@ -1,14 +1,14 @@ GPUDirect Storage Integration ============================= -Many IO APIs can use GPUDirect Storage (GDS) library to optimize IO operations. -GDS enables a direct data path for direct memory access (DMA) transfers between GPU memory and storage, which avoids a bounce buffer through the CPU. -GDS also has a compatibility mode that allows the library to fall back to copying through a CPU bounce buffer. +Many IO APIs can use GPUDirect Storage (GDS) library to optimize IO operations. +GDS enables a direct data path for direct memory access (DMA) transfers between GPU memory and storage, which avoids a bounce buffer through the CPU. +GDS also has a compatibility mode that allows the library to fall back to copying through a CPU bounce buffer. The SDK is available for download `here `_. GDS is also included in CUDA Toolkit 11.4 and higher. -Use of GPUDirect Storage in cuDF is enabled by default, but can be disabled through the environment variable ``LIBCUDF_CUFILE_POLICY``. -This variable also controls the GDS compatibility mode. 
+Use of GPUDirect Storage in cuDF is enabled by default, but can be disabled through the environment variable ``LIBCUDF_CUFILE_POLICY``. +This variable also controls the GDS compatibility mode. There are three valid values for the environment variable: @@ -20,17 +20,17 @@ If no value is set, behavior will be the same as the "GDS" option. This environment variable also affects how cuDF treats GDS errors. When ``LIBCUDF_CUFILE_POLICY`` is set to "GDS" and a GDS API call fails for any reason, cuDF falls back to the internal implementation with bounce buffers. -When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on), +When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on), cuDF throws an exception to propagate the error to te user. Operations that support the use of GPUDirect Storage: -- `read_avro` -- `read_parquet` -- `read_orc` -- `to_csv` -- `to_parquet` -- `to_orc` +- :py:func:`cudf.read_avro` +- :py:func:`cudf.read_parquet` +- :py:func:`cudf.read_orc` +- :py:meth:`cudf.DataFrame.to_csv` +- :py:meth:`cudf.DataFrame.to_parquet` +- :py:meth:`cudf.DataFrame.to_orc` Several parameters that can be used to tune the performance of GDS-enabled I/O are exposed through environment variables: diff --git a/docs/cudf/source/basics/io-nvcomp-integration.rst b/docs/cudf/source/basics/io-nvcomp-integration.rst index 521833e2afd..fc24e0c15f4 100644 --- a/docs/cudf/source/basics/io-nvcomp-integration.rst +++ b/docs/cudf/source/basics/io-nvcomp-integration.rst @@ -1,14 +1,14 @@ nvCOMP Integration ============================= -Some types of compression/decompression can be performed using either `nvCOMP library `_ or the internal implementation. +Some types of compression/decompression can be performed using either the `nvCOMP library `_ or the internal implementation. 
Which implementation is used by default depends on the data format and the compression type. Behavior can be influenced through environment variable ``LIBCUDF_NVCOMP_POLICY``. There are three valid values for the environment variable: -- "STABLE": Only enable the nvCOMP in places where it has been deemed stable for production use. +- "STABLE": Only enable the nvCOMP in places where it has been deemed stable for production use. - "ALWAYS": Enable all available uses of nvCOMP, including new, experimental combinations. - "OFF": Disable nvCOMP use whenever possible and use the internal implementations instead. diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 7c585602c23..915383e4852 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. + from collections.abc import Sequence import cupy @@ -21,21 +23,27 @@ def cut( duplicates: str = "raise", ordered: bool = True, ): + """Bin values into discrete intervals. - """ - Bin values into discrete intervals. Use cut when you need to segment and sort data values into bins. This function is also useful for going from a continuous variable to a categorical variable. + Parameters ---------- x : array-like The input array to be binned. Must be 1-dimensional. bins : int, sequence of scalars, or IntervalIndex The criteria to bin by. - * int : Defines the number of equal-width bins in the - range of x. The range of x is extended by .1% on each - side to include the minimum and maximum values of x. + + * int : Defines the number of equal-width bins in the range of `x`. The + range of `x` is extended by .1% on each side to include the minimum + and maximum values of `x`. + * sequence of scalars : Defines the bin edges allowing for non-uniform + width. No extension of the range of `x` is done. + * IntervalIndex : Defines the exact bins to be used. Note that + IntervalIndex for `bins` must be non-overlapping. 
+ right : bool, default True Indicates whether bins includes the rightmost edge or not. labels : array or False, default None @@ -66,30 +74,38 @@ def cut( For scalar or sequence bins, this is an ndarray with the computed bins. If set duplicates=drop, bins will drop non-unique bin. For an IntervalIndex bins, this is equal to bins. + Examples -------- Discretize into three equal-sized bins. + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3) CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], - ... (5.0, 7.0],(0.994, 3.0]], categories=[(0.994, 3.0], - ... (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category') + (5.0, 7.0], (0.994, 3.0]], categories=[(0.994, 3.0], + (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category') + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) (CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], - ... (5.0, 7.0],(0.994, 3.0]],categories=[(0.994, 3.0], - ... (3.0, 5.0], (5.0, 7.0]],ordered=True, dtype='category'), - array([0.994, 3. , 5. , 7. ])) + (5.0, 7.0], (0.994, 3.0]], categories=[(0.994, 3.0], + (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category'), + array([0.994, 3. , 5. , 7. ])) + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), - ... 3, labels=["bad", "medium", "good"]) + ... 3, labels=["bad", "medium", "good"]) CategoricalIndex(['bad', 'good', 'medium', 'medium', 'good', 'bad'], - ... categories=['bad', 'medium', 'good'],ordered=True, - ... dtype='category') + categories=['bad', 'medium', 'good'],ordered=True, + dtype='category') + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3, - ... labels=["B", "A", "B"], ordered=False) + ... labels=["B", "A", "B"], ordered=False) CategoricalIndex(['B', 'B', 'A', 'A', 'B', 'B'], categories=['A', 'B'], - ... 
ordered=False, dtype='category') + ordered=False, dtype='category') + >>> cudf.cut([0, 1, 1, 2], bins=4, labels=False) array([0, 1, 1, 3], dtype=int32) + Passing a Series as an input returns a Series with categorical dtype: + >>> s = cudf.Series(np.array([2, 4, 6, 8, 10]), ... index=['a', 'b', 'c', 'd', 'e']) >>> cudf.cut(s, 3) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6b98e82d553..40f8eda0e4f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -566,19 +566,20 @@ def mult(df): .. code-block:: >>> df = pd.DataFrame({ - 'a': [1, 1, 2, 2], - 'b': [1, 2, 1, 2], - 'c': [1, 2, 3, 4]}) + ... 'a': [1, 1, 2, 2], + ... 'b': [1, 2, 1, 2], + ... 'c': [1, 2, 3, 4], + ... }) >>> gdf = cudf.from_pandas(df) >>> df.groupby('a').apply(lambda x: x.iloc[[0]]) - a b c - a - 1 0 1 1 1 - 2 2 2 1 3 + a b c + a + 1 0 1 1 1 + 2 2 2 1 3 >>> gdf.groupby('a').apply(lambda x: x.iloc[[0]]) - a b c - 0 1 1 1 - 2 2 1 3 + a b c + 0 1 1 1 + 2 2 1 3 """ if not callable(function): raise TypeError(f"type {type(function)} is not callable") diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index addc823e7f1..7fa66bd831d 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -81,8 +81,8 @@ def name(self, value): @property # type: ignore @_cudf_nvtx_annotate - def ndim(self): - """Get the dimensionality (always 1 for single-columned frames).""" + def ndim(self): # noqa: D401 + """Number of dimensions of the underlying data, by definition 1.""" return 1 @property # type: ignore From 6e6c325e7cb99baeecaec65aff8c97aa2450ff51 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 14 Apr 2022 18:58:48 -0500 Subject: [PATCH 4/7] Fix some docstrings formatting (#10660) This PR fixes some of the broken docstring formattings in the code-base. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/10660 --- docs/cudf/source/api_docs/dataframe.rst | 3 +++ docs/cudf/source/api_docs/index_objects.rst | 2 ++ docs/cudf/source/api_docs/series.rst | 2 ++ docs/cudf/source/api_docs/string_handling.rst | 1 - docs/cudf/source/conf.py | 1 + python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/cut.py | 1 + python/cudf/cudf/core/indexed_frame.py | 2 ++ python/cudf/cudf/core/tools/numeric.py | 2 +- 9 files changed, 13 insertions(+), 3 deletions(-) diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst index 1d600acfef1..e0ef3cb2ff0 100644 --- a/docs/cudf/source/api_docs/dataframe.rst +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -149,6 +149,7 @@ Computations / descriptive stats DataFrame.round DataFrame.skew DataFrame.sum + DataFrame.sum_of_squares DataFrame.std DataFrame.var DataFrame.nunique @@ -248,9 +249,11 @@ Serialization / IO / conversion DataFrame.to_dlpack DataFrame.to_parquet DataFrame.to_csv + DataFrame.to_cupy DataFrame.to_hdf DataFrame.to_dict DataFrame.to_json + DataFrame.to_numpy DataFrame.to_pandas DataFrame.to_feather DataFrame.to_records diff --git a/docs/cudf/source/api_docs/index_objects.rst b/docs/cudf/source/api_docs/index_objects.rst index 6f5affd0ecd..8e0e3bbd411 100644 --- a/docs/cudf/source/api_docs/index_objects.rst +++ b/docs/cudf/source/api_docs/index_objects.rst @@ -92,7 +92,9 @@ Conversion Index.astype Index.to_arrow + Index.to_cupy Index.to_list + Index.to_numpy Index.to_series Index.to_frame Index.to_pandas diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 95aa71919e4..d7015c9348d 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -390,10 +390,12 @@ Serialization / IO / conversion :toctree: api/ Series.to_arrow + Series.to_cupy 
Series.to_dlpack Series.to_frame Series.to_hdf Series.to_json + Series.to_numpy Series.to_pandas Series.to_string Series.from_arrow diff --git a/docs/cudf/source/api_docs/string_handling.rst b/docs/cudf/source/api_docs/string_handling.rst index 3087bcaa826..8d4646c47a7 100644 --- a/docs/cudf/source/api_docs/string_handling.rst +++ b/docs/cudf/source/api_docs/string_handling.rst @@ -83,7 +83,6 @@ strings and apply several methods to it. These can be accessed like rsplit startswith strip - subword_tokenize swapcase title token_count diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index dbdf8e59e6a..d65b77ef74b 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -252,6 +252,7 @@ def process_class_docstrings(app, what, name, obj, options, lines): lines[:] = lines[:cut_index] +nitpick_ignore = [("py:class", "SeriesOrIndex"),] def setup(app): diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 259a7f711c3..6fed6510484 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -118,7 +118,7 @@ def get_level_values(self, level): See Also -------- - cudf.core.multiindex.MultiIndex.get_level_values : Get values for + cudf.MultiIndex.get_level_values : Get values for a level of a MultiIndex. Notes diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 915383e4852..0fef6630248 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -64,6 +64,7 @@ def cut( Categorical and Series (with Categorical dtype). If True, the resulting categorical will be ordered. If False, the resulting categorical will be unordered (labels must be provided). 
+ Returns ------- out : CategoricalIndex diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 10736948b57..ea722ec3968 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -991,6 +991,7 @@ def add_prefix(self, prefix): Examples -------- **Series** + >>> s = cudf.Series([1, 2, 3, 4]) >>> s 0 1 @@ -1006,6 +1007,7 @@ def add_prefix(self, prefix): dtype: int64 **DataFrame** + >>> df = cudf.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]}) >>> df A B diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 7eea7cedaad..0273227010b 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -57,7 +57,7 @@ def to_numeric(arg, errors="raise", downcast=None): otherwise ndarray Notes - ------- + ----- An important difference from pandas is that this function does not accept mixed numeric/non-numeric type sequences. For example ``[1, 'a']``. A ``TypeError`` will be raised when such input is received, regardless of From 8f5a04451f8f61015d08c5699f0427b550afb53b Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 14 Apr 2022 17:24:37 -0700 Subject: [PATCH 5/7] Add option to drop cache in cuIO benchmarks (#10488) Dropping cache allows us to benchmark I/O times in a realistic/fair way. Cache is dropped before each iteration if `CUDF_BENCHMARK_DROP_CACHE` environment variable is set. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - MithunR (https://github.com/mythrocks) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10488 --- cpp/benchmarks/io/csv/csv_reader.cpp | 2 ++ cpp/benchmarks/io/cuio_common.cpp | 28 ++++++++++++++++++++ cpp/benchmarks/io/cuio_common.hpp | 10 +++++++ cpp/benchmarks/io/orc/orc_reader.cpp | 2 ++ cpp/benchmarks/io/parquet/parquet_reader.cpp | 2 ++ cpp/benchmarks/io/text/multibyte_split.cpp | 1 + 6 files changed, 45 insertions(+) diff --git a/cpp/benchmarks/io/csv/csv_reader.cpp b/cpp/benchmarks/io/csv/csv_reader.cpp index c50f5220200..6f5e7160cd3 100644 --- a/cpp/benchmarks/io/csv/csv_reader.cpp +++ b/cpp/benchmarks/io/csv/csv_reader.cpp @@ -52,6 +52,7 @@ void BM_csv_read_varying_input(benchmark::State& state) auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::read_csv(read_options); } @@ -98,6 +99,7 @@ void BM_csv_read_varying_options(benchmark::State& state) cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 for (int32_t chunk = 0; chunk < num_chunks; ++chunk) { // only read the header in the first chunk diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index afe0cc77a4c..7d356263220 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -141,3 +141,31 @@ std::vector segments_in_chunk(int num_segments, int num_chunks, return selected_segments; } + +// Executes the command and returns stderr output +std::string exec_cmd(std::string_view cmd) +{ + // Switch stderr and stdout to only capture stderr + auto const redirected_cmd = 
std::string{"( "}.append(cmd).append(" 3>&2 2>&1 1>&3) 2>/dev/null"); + std::unique_ptr pipe(popen(redirected_cmd.c_str(), "r"), pclose); + CUDF_EXPECTS(pipe != nullptr, "popen() failed"); + + std::array buffer; + std::string error_out; + while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { + error_out += buffer.data(); + } + return error_out; +} + +void try_drop_l3_cache() +{ + static bool is_drop_cache_enabled = std::getenv("CUDF_BENCHMARK_DROP_CACHE") != nullptr; + if (not is_drop_cache_enabled) { return; } + + std::array drop_cache_cmds{"/sbin/sysctl vm.drop_caches=3", "sudo /sbin/sysctl vm.drop_caches=3"}; + CUDF_EXPECTS(std::any_of(drop_cache_cmds.cbegin(), + drop_cache_cmds.cend(), + [](auto& cmd) { return exec_cmd(cmd).empty(); }), + "Failed to execute the drop cache command"); +} diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index 2ed534d5333..ff900d20e6f 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -132,3 +132,13 @@ std::vector select_column_names(std::vector const& col * The segments could be Parquet row groups or ORC stripes. */ std::vector segments_in_chunk(int num_segments, int num_chunks, int chunk); + +/** + * @brief Drops L3 cache if `CUDF_BENCHMARK_DROP_CACHE` environment variable is set. + * + * Has no effect if the environment variable is not set. + * May require sudo access to run successfully. 
+ * + * @throw cudf::logic_error if the environment variable is set and the command fails + */ +void try_drop_l3_cache(); diff --git a/cpp/benchmarks/io/orc/orc_reader.cpp b/cpp/benchmarks/io/orc/orc_reader.cpp index 0fc2238a272..fc76fbe7603 100644 --- a/cpp/benchmarks/io/orc/orc_reader.cpp +++ b/cpp/benchmarks/io/orc/orc_reader.cpp @@ -60,6 +60,7 @@ void BM_orc_read_varying_input(benchmark::State& state) auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::read_orc(read_opts); } @@ -117,6 +118,7 @@ void BM_orc_read_varying_options(benchmark::State& state) cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf::size_type rows_read = 0; diff --git a/cpp/benchmarks/io/parquet/parquet_reader.cpp b/cpp/benchmarks/io/parquet/parquet_reader.cpp index 8a97fd35c31..b20534e8ac0 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader.cpp @@ -60,6 +60,7 @@ void BM_parq_read_varying_input(benchmark::State& state) auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer const raii(state, true); // flush_l2_cache = true, stream = 0 cudf_io::read_parquet(read_opts); } @@ -117,6 +118,7 @@ void BM_parq_read_varying_options(benchmark::State& state) cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 cudf::size_type rows_read = 0; diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index ada8856e8e5..af6c2c5e030 100644 --- 
a/cpp/benchmarks/io/text/multibyte_split.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -137,6 +137,7 @@ static void BM_multibyte_split(benchmark::State& state) auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { + try_drop_l3_cache(); cuda_event_timer raii(state, true); auto output = cudf::io::text::multibyte_split(*source, delim); } From b542678fda6ea40544d42e759caf3a6f8ad2b44d Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Fri, 15 Apr 2022 09:59:51 -0400 Subject: [PATCH 6/7] cuco isn't a cudf dependency when we are built shared (#10662) With the corrections in https://github.com/rapidsai/cudf/pull/10545 we didn't install the cuco headers / cmake files as they aren't needed for shared builds. But we forgot to remove the `find_package(cuco)` call from the generated cudf-config.cmake. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Paul Taylor (https://github.com/trxcllnt) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/10662 --- cpp/cmake/thirdparty/get_cucollections.cmake | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake index 1639655d1e9..5232821d113 100644 --- a/cpp/cmake/thirdparty/get_cucollections.cmake +++ b/cpp/cmake/thirdparty/get_cucollections.cmake @@ -21,12 +21,14 @@ function(find_and_configure_cucollections) cuco 0.0.1 GLOBAL_TARGETS cuco::cuco BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports CPM_ARGS GITHUB_REPOSITORY NVIDIA/cuCollections GIT_TAG fb58a38701f1c24ecfe07d8f1f208bbe80930da5 EXCLUDE_FROM_ALL ${BUILD_SHARED_LIBS} OPTIONS "BUILD_TESTS OFF" "BUILD_BENCHMARKS OFF" "BUILD_EXAMPLES OFF" ) + if(NOT BUILD_SHARED_LIBS) + rapids_export_package(INSTALL cuco cudf-exports) + endif() endfunction() From 4e668f27ba741ec1065b6ae6f99c0a4608df4336 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 15 Apr 2022 09:40:49 -0500 
Subject: [PATCH 7/7] Update UDF notebook in User Guide. (#10668) I noticed a couple lines I didn't expect in the UDF notebook in the User Guide while working on #10663. I didn't get these changes into that PR (had to wait for a local build to verify some things). The two changes are: - We don't require `method="cudf"` in groupby statements. - We don't need to execute `from cudf.utils import cudautils` to run this notebook. (The cell execution counts also changed. There were some cells executed multiple times the last time this notebook was executed so they got out of order - this fixes it.) Authors: - Bradley Dice (https://github.com/bdice) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/10668 --- .../source/user_guide/guide-to-udfs.ipynb | 152 +++++++++--------- 1 file changed, 75 insertions(+), 77 deletions(-) diff --git a/docs/cudf/source/user_guide/guide-to-udfs.ipynb b/docs/cudf/source/user_guide/guide-to-udfs.ipynb index 41bce8b865e..0d05ddb00b4 100644 --- a/docs/cudf/source/user_guide/guide-to-udfs.ipynb +++ b/docs/cudf/source/user_guide/guide-to-udfs.ipynb @@ -138,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -148,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -160,7 +160,7 @@ "dtype: int64" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -193,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -205,7 +205,7 @@ "dtype: int64" ] }, - "execution_count": 9, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -218,7 +218,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -229,7 +229,7 @@ }, { "cell_type": "code", - "execution_count": 11, + 
"execution_count": 9, "metadata": {}, "outputs": [ { @@ -241,7 +241,7 @@ "dtype: int64" ] }, - "execution_count": 11, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -260,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -274,7 +274,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -286,7 +286,7 @@ "dtype: int64" ] }, - "execution_count": 14, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -322,7 +322,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -331,7 +331,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -355,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -373,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -452,7 +452,7 @@ "4 979 982 1011 9790.0" ] }, - "execution_count": 19, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -497,7 +497,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -514,7 +514,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -569,7 +569,7 @@ "2 3 6" ] }, - "execution_count": 21, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -591,7 +591,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -603,7 +603,7 @@ "dtype: int64" ] }, - "execution_count": 22, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -621,7 +621,7 @@ }, { "cell_type": 
"code", - "execution_count": 23, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -633,7 +633,7 @@ "dtype: object" ] }, - "execution_count": 23, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -658,7 +658,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -709,7 +709,7 @@ "2 3" ] }, - "execution_count": 24, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -728,7 +728,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -740,7 +740,7 @@ "dtype: int64" ] }, - "execution_count": 25, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -758,7 +758,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -813,7 +813,7 @@ "2 3 1" ] }, - "execution_count": 26, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -836,7 +836,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -848,7 +848,7 @@ "dtype: int64" ] }, - "execution_count": 27, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -866,7 +866,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -921,7 +921,7 @@ "2 3 3.14" ] }, - "execution_count": 28, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -939,7 +939,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -951,7 +951,7 @@ "dtype: float64" ] }, - "execution_count": 29, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -982,7 +982,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -1033,7 +1033,7 @@ "2 5" ] }, - "execution_count": 30, 
+ "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1054,7 +1054,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1066,7 +1066,7 @@ "dtype: float64" ] }, - "execution_count": 31, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1084,7 +1084,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1151,7 +1151,7 @@ "2 3 6 4 8 6" ] }, - "execution_count": 32, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1172,7 +1172,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1184,7 +1184,7 @@ "dtype: float64" ] }, - "execution_count": 33, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1212,7 +1212,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -1241,7 +1241,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1312,7 +1312,7 @@ "2 3 6 4 8 6 9.0" ] }, - "execution_count": 35, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1344,7 +1344,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -1417,7 +1417,7 @@ "4 979 982 1011" ] }, - "execution_count": 36, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1443,7 +1443,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -1522,7 +1522,7 @@ "4 979 982 1011 1961.0" ] }, - "execution_count": 37, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -1555,7 +1555,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -1570,7 
+1570,7 @@ "dtype: float64" ] }, - "execution_count": 38, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -1582,7 +1582,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -1591,7 +1591,7 @@ "Rolling [window=3,min_periods=3,center=False]" ] }, - "execution_count": 39, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -1610,7 +1610,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -1634,7 +1634,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -1649,7 +1649,7 @@ "dtype: float64" ] }, - "execution_count": 41, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -1667,7 +1667,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -1734,7 +1734,7 @@ "4 59.0 59.0" ] }, - "execution_count": 42, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -1748,7 +1748,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -1845,7 +1845,7 @@ "9 100.0 100.0" ] }, - "execution_count": 43, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -1863,12 +1863,12 @@ "\n", "We can also apply UDFs to grouped DataFrames using `apply_grouped`. This example is also drawn and adapted from the RAPIDS [API documentation]().\n", "\n", - "First, we'll group our DataFrame based on column `b`, which is either True or False. Note that we currently need to pass `method=\"cudf\"` to use UDFs with GroupBy objects." + "First, we'll group our DataFrame based on column `b`, which is either True or False." 
] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -1947,7 +1947,7 @@ "4 -0.970850 False Sarah 0.342905" ] }, - "execution_count": 44, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -1959,7 +1959,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -1975,7 +1975,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -2002,7 +2002,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -2132,7 +2132,7 @@ "9 -0.725581 True George 0.405245 0.271319" ] }, - "execution_count": 47, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -2162,7 +2162,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -2171,7 +2171,7 @@ "array([ 1., 2., 3., 4., 10.])" ] }, - "execution_count": 48, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } @@ -2193,7 +2193,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -2207,14 +2207,12 @@ "dtype: int32" ] }, - "execution_count": 49, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from cudf.utils import cudautils\n", - "\n", "@cuda.jit\n", "def multiply_by_5(x, out):\n", " i = cuda.grid(1)\n", @@ -2235,7 +2233,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 46, "metadata": {}, "outputs": [ { @@ -2244,7 +2242,7 @@ "array([ 5., 10., 15., 20., 50.])" ] }, - "execution_count": 50, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -2307,7 +2305,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.12" } }, 
"nbformat": 4,