From e3ea261f0ba04b510b8d1e9c6683cf7f8bd4262c Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 14 Apr 2022 11:57:39 -0700 Subject: [PATCH 1/9] Improve cudf.cut docstrings. --- python/cudf/cudf/core/cut.py | 44 ++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 7c585602c23..b3fa9b179fc 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -21,21 +21,27 @@ def cut( duplicates: str = "raise", ordered: bool = True, ): + """Bin values into discrete intervals. - """ - Bin values into discrete intervals. Use cut when you need to segment and sort data values into bins. This function is also useful for going from a continuous variable to a categorical variable. + Parameters ---------- x : array-like The input array to be binned. Must be 1-dimensional. bins : int, sequence of scalars, or IntervalIndex The criteria to bin by. - * int : Defines the number of equal-width bins in the - range of x. The range of x is extended by .1% on each - side to include the minimum and maximum values of x. + + * int : Defines the number of equal-width bins in the range of `x`. The + range of `x` is extended by .1% on each side to include the minimum + and maximum values of `x`. + * sequence of scalars : Defines the bin edges allowing for non-uniform + width. No extension of the range of `x` is done. + * IntervalIndex : Defines the exact bins to be used. Note that + IntervalIndex for `bins` must be non-overlapping. + right : bool, default True Indicates whether bins includes the rightmost edge or not. labels : array or False, default None @@ -66,30 +72,38 @@ def cut( For scalar or sequence bins, this is an ndarray with the computed bins. If set duplicates=drop, bins will drop non-unique bin. For an IntervalIndex bins, this is equal to bins. + Examples -------- Discretize into three equal-sized bins. + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3) CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], - ... (5.0, 7.0],(0.994, 3.0]], categories=[(0.994, 3.0], - ... (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category') + (5.0, 7.0], (0.994, 3.0]], categories=[(0.994, 3.0], + (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category') + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) (CategoricalIndex([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], - ... (5.0, 7.0],(0.994, 3.0]],categories=[(0.994, 3.0], - ... (3.0, 5.0], (5.0, 7.0]],ordered=True, dtype='category'), - array([0.994, 3. , 5. , 7. ])) + (5.0, 7.0], (0.994, 3.0]], categories=[(0.994, 3.0], + (3.0, 5.0], (5.0, 7.0]], ordered=True, dtype='category'), + array([0.994, 3. , 5. , 7. ])) + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), - ... 3, labels=["bad", "medium", "good"]) + ... 3, labels=["bad", "medium", "good"]) CategoricalIndex(['bad', 'good', 'medium', 'medium', 'good', 'bad'], - ... categories=['bad', 'medium', 'good'],ordered=True, - ... dtype='category') + categories=['bad', 'medium', 'good'],ordered=True, + dtype='category') + >>> cudf.cut(np.array([1, 7, 5, 4, 6, 3]), 3, - ... labels=["B", "A", "B"], ordered=False) + ... labels=["B", "A", "B"], ordered=False) CategoricalIndex(['B', 'B', 'A', 'A', 'B', 'B'], categories=['A', 'B'], - ... ordered=False, dtype='category') + ordered=False, dtype='category') + >>> cudf.cut([0, 1, 1, 2], bins=4, labels=False) array([0, 1, 1, 3], dtype=int32) + Passing a Series as an input returns a Series with categorical dtype: + >>> s = cudf.Series(np.array([2, 4, 6, 8, 10]), ... index=['a', 'b', 'c', 'd', 'e']) >>> cudf.cut(s, 3) From f72a03fc079f90a49154155479efada432cda53b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 14 Apr 2022 11:58:08 -0700 Subject: [PATCH 2/9] Rewrite docstring for SingleColumnFrame.ndim. --- python/cudf/cudf/core/single_column_frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 003f8ea7fdb..d104585a020 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -82,7 +82,7 @@ def name(self, value): @property # type: ignore @_cudf_nvtx_annotate def ndim(self): - """Get the dimensionality (always 1 for single-columned frames).""" + """Number of dimensions of the underlying data, by definition 1.""" return 1 @property # type: ignore From cf9ad4860d130d2d5bb10abfd39e216da05cc42e Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 14 Apr 2022 11:58:47 -0700 Subject: [PATCH 3/9] Improve formatting of pandas compatibility notice for groupby.apply. --- python/cudf/cudf/core/groupby/groupby.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6b98e82d553..b5a61598d0d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -566,19 +566,20 @@ def mult(df): .. code-block:: >>> df = pd.DataFrame({ - 'a': [1, 1, 2, 2], - 'b': [1, 2, 1, 2], - 'c': [1, 2, 3, 4]}) + ... 'a': [1, 1, 2, 2], + ... 'b': [1, 2, 1, 2], + ... 'c': [1, 2, 3, 4], + ... }) >>> gdf = cudf.from_pandas(df) >>> df.groupby('a').apply(lambda x: x.iloc[[0]]) - a b c - a - 1 0 1 1 1 - 2 2 2 1 3 + a b c │········· + a │········· + 1 0 1 1 1 │········· + 2 2 2 1 3 >>> gdf.groupby('a').apply(lambda x: x.iloc[[0]]) - a b c - 0 1 1 1 - 2 2 1 3 + a b c │········· + 0 1 1 1 │········· + 2 2 1 3 """ if not callable(function): raise TypeError(f"type {type(function)} is not callable") From 5da19303d0f7f8ca58f259d290c2be310704ba8c Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 14 Apr 2022 11:59:09 -0700 Subject: [PATCH 4/9] Add list, struct to basic type information table. --- docs/cudf/source/basics/basics.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/cudf/source/basics/basics.rst b/docs/cudf/source/basics/basics.rst index 60a65558033..bb703b54e7b 100644 --- a/docs/cudf/source/basics/basics.rst +++ b/docs/cudf/source/basics/basics.rst @@ -40,11 +40,15 @@ The following table lists all of cudf types. For methods requiring dtype argumen | | Decimal64Dtype, | | | | | Decimal128Dtype | | | +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Lists | | list | ``'list'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + | Structs | | dict | ``'struct'`` | + +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ **Note: All dtypes above are Nullable** -.. _np.int8: -.. _np.int16: +.. _np.int8: +.. _np.int16: .. _np.int32: .. _np.int64: .. _np.uint8: From d015e99e22b63c90345a19e439ad9f3a37359bcd Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 14 Apr 2022 11:59:20 -0700 Subject: [PATCH 5/9] Specify UTF-8 encoding. --- docs/cudf/source/basics/internals.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/cudf/source/basics/internals.rst b/docs/cudf/source/basics/internals.rst index 60b63c6fab8..96ef40d51e6 100644 --- a/docs/cudf/source/basics/internals.rst +++ b/docs/cudf/source/basics/internals.rst @@ -54,7 +54,7 @@ As another example, the ``StringColumn`` backing the Series 2. No mask buffer as there are no nulls in the Series 3. Two children columns: - - A column of 8-bit characters + - A column of UTF-8 characters ``['d', 'o', 'y', 'o', 'u', h' ... '?']`` - A column of "offsets" to the characters column (in this case, ``[0, 2, 5, 9, 12, 19]``) @@ -172,7 +172,7 @@ Selecting columns by index: >>> ca.select_by_index(1) ColumnAccessor(OrderedColumnDict([('y', )]), multiindex=False, level_names=(None,)) >>> ca.select_by_index([0, 1]) - ColumnAccessor(OrderedColumnDict([('x', ), ('y', )]), multiindex=False, level_names=(None,)) + ColumnAccessor(OrderedColumnDict([('x', ), ('y', )]), multiindex=False, level_names=(None,)) >>> ca.select_by_index(slice(1, 3)) ColumnAccessor(OrderedColumnDict([('y', ), ('z', )]), multiindex=False, level_names=(None,)) From 200e6a69f12a7a2209e57151e4518ea4583ec8f6 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 14 Apr 2022 11:59:42 -0700 Subject: [PATCH 6/9] Add links to GDS supported methods. --- .../cudf/source/basics/io-gds-integration.rst | 24 +++++++++---------- .../source/basics/io-nvcomp-integration.rst | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/cudf/source/basics/io-gds-integration.rst b/docs/cudf/source/basics/io-gds-integration.rst index 71c114e9149..5ff07ac29c5 100644 --- a/docs/cudf/source/basics/io-gds-integration.rst +++ b/docs/cudf/source/basics/io-gds-integration.rst @@ -1,14 +1,14 @@ GPUDirect Storage Integration ============================= -Many IO APIs can use GPUDirect Storage (GDS) library to optimize IO operations. -GDS enables a direct data path for direct memory access (DMA) transfers between GPU memory and storage, which avoids a bounce buffer through the CPU. -GDS also has a compatibility mode that allows the library to fall back to copying through a CPU bounce buffer. +Many IO APIs can use GPUDirect Storage (GDS) library to optimize IO operations. +GDS enables a direct data path for direct memory access (DMA) transfers between GPU memory and storage, which avoids a bounce buffer through the CPU. +GDS also has a compatibility mode that allows the library to fall back to copying through a CPU bounce buffer. The SDK is available for download `here `_. GDS is also included in CUDA Toolkit 11.4 and higher. -Use of GPUDirect Storage in cuDF is enabled by default, but can be disabled through the environment variable ``LIBCUDF_CUFILE_POLICY``. -This variable also controls the GDS compatibility mode. +Use of GPUDirect Storage in cuDF is enabled by default, but can be disabled through the environment variable ``LIBCUDF_CUFILE_POLICY``. +This variable also controls the GDS compatibility mode. There are three valid values for the environment variable: @@ -20,17 +20,17 @@ If no value is set, behavior will be the same as the "GDS" option. This environment variable also affects how cuDF treats GDS errors. When ``LIBCUDF_CUFILE_POLICY`` is set to "GDS" and a GDS API call fails for any reason, cuDF falls back to the internal implementation with bounce buffers. -When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on), +When ``LIBCUDF_CUFILE_POLICY`` is set to "ALWAYS" and a GDS API call fails for any reason (unlikely, given that the compatibility mode is on), cuDF throws an exception to propagate the error to te user. Operations that support the use of GPUDirect Storage: -- `read_avro` -- `read_parquet` -- `read_orc` -- `to_csv` -- `to_parquet` -- `to_orc` +- :py:func:`cudf.read_avro` +- :py:func:`cudf.read_parquet` +- :py:func:`cudf.read_orc` +- :py:meth:`cudf.DataFrame.to_csv` +- :py:meth:`cudf.DataFrame.to_parquet` +- :py:meth:`cudf.DataFrame.to_orc` Several parameters that can be used to tune the performance of GDS-enabled I/O are exposed through environment variables: diff --git a/docs/cudf/source/basics/io-nvcomp-integration.rst b/docs/cudf/source/basics/io-nvcomp-integration.rst index 521833e2afd..fc24e0c15f4 100644 --- a/docs/cudf/source/basics/io-nvcomp-integration.rst +++ b/docs/cudf/source/basics/io-nvcomp-integration.rst @@ -1,14 +1,14 @@ nvCOMP Integration ============================= -Some types of compression/decompression can be performed using either `nvCOMP library `_ or the internal implementation. +Some types of compression/decompression can be performed using either the `nvCOMP library `_ or the internal implementation. Which implementation is used by default depends on the data format and the compression type. Behavior can be influenced through environment variable ``LIBCUDF_NVCOMP_POLICY``. There are three valid values for the environment variable: -- "STABLE": Only enable the nvCOMP in places where it has been deemed stable for production use. +- "STABLE": Only enable the nvCOMP in places where it has been deemed stable for production use. - "ALWAYS": Enable all available uses of nvCOMP, including new, experimental combinations. - "OFF": Disable nvCOMP use whenever possible and use the internal implementations instead. From 96840457c7d2a95c78fc0ee2c56f7fcc6a797275 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 14 Apr 2022 14:57:52 -0500 Subject: [PATCH 7/9] Apply suggestions from code review Co-authored-by: GALI PREM SAGAR --- docs/cudf/source/basics/basics.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/cudf/source/basics/basics.rst b/docs/cudf/source/basics/basics.rst index bb703b54e7b..e6f42b04c99 100644 --- a/docs/cudf/source/basics/basics.rst +++ b/docs/cudf/source/basics/basics.rst @@ -40,9 +40,9 @@ The following table lists all of cudf types. For methods requiring dtype argumen | | Decimal64Dtype, | | | | | Decimal128Dtype | | | +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Lists | | list | ``'list'`` | + | Lists | ListDtype | list | ``'list'`` | +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Structs | | dict | ``'struct'`` | + | Structs | StructDtype | dict | ``'struct'`` | +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ **Note: All dtypes above are Nullable** From 3ad234c825d26c453b67c228206268ca0f2ee2f3 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 14 Apr 2022 13:02:07 -0700 Subject: [PATCH 8/9] Reformat table. --- docs/cudf/source/basics/basics.rst | 58 +++++++++++++++--------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/docs/cudf/source/basics/basics.rst b/docs/cudf/source/basics/basics.rst index e6f42b04c99..9b8983fba49 100644 --- a/docs/cudf/source/basics/basics.rst +++ b/docs/cudf/source/basics/basics.rst @@ -15,35 +15,35 @@ The following table lists all of cudf types. For methods requiring dtype argumen .. rst-class:: special-table .. table:: - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Kind of Data | Data Type | Scalar | String Aliases | - +========================+==================+=====================================================================================+=============================================+ - | Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, np.uint16_, | ``'int8'``, ``'int16'``, ``'int32'``, | - | | | np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | - | | | | ``'uint32'``, ``'uint64'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Float | | np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Strings | | `str `_ | ``'string'``, ``'object'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | - | | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``,| - | (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'``| - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Categorical | CategoricalDtype | (none) | ``'category'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Boolean | | np.bool_ | ``'bool'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Decimal | Decimal32Dtype, | (none) | (none) | - | | Decimal64Dtype, | | | - | | Decimal128Dtype | | | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Lists | ListDtype | list | ``'list'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ - | Structs | StructDtype | dict | ``'struct'`` | - +------------------------+------------------+-------------------------------------------------------------------------------------+---------------------------------------------+ + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Kind of Data | Data Type | Scalar | String Aliases | + +=================+==================+==============================================================+==============================================+ + | Integer | | np.int8_, np.int16_, np.int32_, np.int64_, np.uint8_, | ``'int8'``, ``'int16'``, ``'int32'``, | + | | | np.uint16_, np.uint32_, np.uint64_ | ``'int64'``, ``'uint8'``, ``'uint16'``, | + | | | | ``'uint32'``, ``'uint64'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Float | | np.float32_, np.float64_ | ``'float32'``, ``'float64'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Strings | | `str `_ | ``'string'``, ``'object'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Datetime | | np.datetime64_ | ``'datetime64[s]'``, ``'datetime64[ms]'``, | + | | | | ``'datetime64[us]'``, ``'datetime64[ns]'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Timedelta | | np.timedelta64_ | ``'timedelta64[s]'``, ``'timedelta64[ms]'``, | + | (duration type) | | | ``'timedelta64[us]'``, ``'timedelta64[ns]'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Categorical | CategoricalDtype | (none) | ``'category'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Boolean | | np.bool_ | ``'bool'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Decimal | Decimal32Dtype, | (none) | (none) | + | | Decimal64Dtype, | | | + | | Decimal128Dtype | | | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Lists | ListDtype | list | ``'list'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ + | Structs | StructDtype | dict | ``'struct'`` | + +-----------------+------------------+--------------------------------------------------------------+----------------------------------------------+ **Note: All dtypes above are Nullable** From cce4cc888bd85ddd0e50e562017f8a33723c2cdc Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 14 Apr 2022 15:21:59 -0500 Subject: [PATCH 9/9] Fix style and copyrights. --- python/cudf/cudf/core/cut.py | 2 ++ python/cudf/cudf/core/groupby/groupby.py | 10 +++++----- python/cudf/cudf/core/single_column_frame.py | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index b3fa9b179fc..915383e4852 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021-2022, NVIDIA CORPORATION. + from collections.abc import Sequence import cupy diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index b5a61598d0d..40f8eda0e4f 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -572,13 +572,13 @@ def mult(df): ... }) >>> gdf = cudf.from_pandas(df) >>> df.groupby('a').apply(lambda x: x.iloc[[0]]) - a b c │········· - a │········· - 1 0 1 1 1 │········· + a b c + a + 1 0 1 1 1 2 2 2 1 3 >>> gdf.groupby('a').apply(lambda x: x.iloc[[0]]) - a b c │········· - 0 1 1 1 │········· + a b c + 0 1 1 1 2 2 1 3 """ if not callable(function): diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index d104585a020..553fab47f33 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -81,7 +81,7 @@ def name(self, value): @property # type: ignore @_cudf_nvtx_annotate - def ndim(self): + def ndim(self): # noqa: D401 """Number of dimensions of the underlying data, by definition 1.""" return 1