From c11b9a4ea2fa72dc0868830dab337ae6851284a8 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 29 Oct 2021 15:01:18 -0700 Subject: [PATCH 01/58] Add doctests script. --- python/cudf/cudf/tests/test_doctests.py | 28 +++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 python/cudf/cudf/tests/test_doctests.py diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py new file mode 100644 index 00000000000..f01a79a0d53 --- /dev/null +++ b/python/cudf/cudf/tests/test_doctests.py @@ -0,0 +1,28 @@ +import doctest +import inspect + +import pytest + +import cudf + + +def fetch_doctests(): + finder = doctest.DocTestFinder() + for name, member in inspect.getmembers(cudf): + if inspect.ismodule(member): + for docstring in finder.find(member): + if docstring.examples: + yield docstring + + +class TestDoctests: + @pytest.mark.parametrize( + "docstring", fetch_doctests(), ids=lambda docstring: docstring.name + ) + def test_docstring(self, docstring): + optionflags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE + runner = doctest.DocTestRunner(optionflags=optionflags) + runner.run(docstring) + results = runner.summarize() + if results.failed: + raise AssertionError(results) From 5e88c6755b06ee85a99a3e82cd7c65bce9060feb Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 1 Nov 2021 20:46:01 -0500 Subject: [PATCH 02/58] Intermediate progress. 
--- python/cudf/cudf/__init__.py | 83 +++++++++++++++++++++ python/cudf/cudf/api/__init__.py | 4 +- python/cudf/cudf/api/extensions/__init__.py | 6 ++ python/cudf/cudf/api/extensions/accessor.py | 7 ++ python/cudf/cudf/tests/test_doctests.py | 30 ++++++-- 5 files changed, 124 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index df09a72ce25..8155d067ebb 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -114,3 +114,86 @@ __version__ = get_versions()["version"] del get_versions + +__all__ = [ + "dtype", + "api", + "core", + "datasets", + "testing", + "NA", + "Scalar", + "BaseIndex", + "CategoricalIndex", + "DatetimeIndex", + "Float32Index", + "Float64Index", + "Index", + "GenericIndex", + "Int8Index", + "Int16Index", + "Int32Index", + "Int64Index", + "IntervalIndex", + "RangeIndex", + "StringIndex", + "TimedeltaIndex", + "UInt8Index", + "UInt16Index", + "UInt32Index", + "UInt64Index", + "interval_range", + "DataFrame", + "from_pandas", + "merge", + "Series", + "MultiIndex", + "cut", + "factorize", + "CategoricalDtype", + "Decimal64Dtype", + "Decimal32Dtype", + "IntervalDtype", + "ListDtype", + "StructDtype", + "Grouper", + "add", + "arccos", + "arcsin", + "arctan", + "cos", + "exp", + "floor_divide", + "log", + "logical_and", + "logical_not", + "logical_or", + "multiply", + "remainder", + "sin", + "sqrt", + "subtract", + "tan", + "true_divide", + "concat", + "get_dummies", + "melt", + "merge_sorted", + "pivot", + "unstack", + "isclose", + "DateOffset", + "to_datetime", + "to_numeric", + "from_dlpack", + "read_avro", + "read_csv", + "read_feather", + "read_hdf", + "read_json", + "read_orc", + "read_parquet", + "read_text", + "set_allocator", + "__version__", +] diff --git a/python/cudf/cudf/api/__init__.py b/python/cudf/cudf/api/__init__.py index 21c24015e41..c66bfb4efeb 100644 --- a/python/cudf/cudf/api/__init__.py +++ b/python/cudf/cudf/api/__init__.py @@ -1,3 +1,5 @@ # Copyright 
(c) 2021, NVIDIA CORPORATION. -from cudf.api import types +from cudf.api import extensions, types + +__all__ = ["extensions", "types"] diff --git a/python/cudf/cudf/api/extensions/__init__.py b/python/cudf/cudf/api/extensions/__init__.py index c971e6f7731..eeb5dcdb32a 100644 --- a/python/cudf/cudf/api/extensions/__init__.py +++ b/python/cudf/cudf/api/extensions/__init__.py @@ -5,3 +5,9 @@ register_index_accessor, register_series_accessor, ) + +__all__ = [ + "register_dataframe_accessor", + "register_index_accessor", + "register_series_accessor", +] diff --git a/python/cudf/cudf/api/extensions/accessor.py b/python/cudf/cudf/api/extensions/accessor.py index a27ffa90cfc..524c11f048d 100644 --- a/python/cudf/cudf/api/extensions/accessor.py +++ b/python/cudf/cudf/api/extensions/accessor.py @@ -159,3 +159,10 @@ def register_index_accessor(name): def register_series_accessor(name): """{docstring}""" return _register_accessor(name, cudf.Series) + + +__all__ = [ + "register_dataframe_accessor", + "register_index_accessor", + "register_series_accessor", +] diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index f01a79a0d53..c1cf62de007 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -5,14 +5,34 @@ import cudf +# These classes and all subclasses will be doctested +doctested_classes = [ + "Frame", + "BaseIndex", +] + + +def find_docstrings_in_module(finder, module): + print("Finding in module", module.__name__) + for docstring in finder.find(module): + print("Finding in docstring", docstring.name, docstring.filename) + if docstring.examples: + yield docstring + for name, member in inspect.getmembers(module): + if name not in getattr(module, "__all__", []): + if inspect.ismodule(member): + print("SKIPPING MODULE", module.__name__, name) + else: + print("Skipping member", module.__name__, name) + continue + # print("Investigating", name) + if inspect.ismodule(member): + yield from 
find_docstrings_in_module(finder, member) + def fetch_doctests(): finder = doctest.DocTestFinder() - for name, member in inspect.getmembers(cudf): - if inspect.ismodule(member): - for docstring in finder.find(member): - if docstring.examples: - yield docstring + yield from find_docstrings_in_module(finder, cudf) class TestDoctests: From bb37a38064ec58acc4d8eeb9a34b93acccc9118e Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Dec 2021 14:39:14 -0600 Subject: [PATCH 03/58] Update __all__ in cudf/__init__.py. --- python/cudf/cudf/__init__.py | 81 ++++++++++++++---------------------- 1 file changed, 32 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index b528c2e16ad..961438e22bc 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -56,6 +56,8 @@ StructDtype, ) from cudf.core.groupby import Grouper + +# TODO: Math operations like add, arccos, etc. are not exposed in pandas' root namespace. from cudf.core.ops import ( add, arccos, @@ -84,6 +86,8 @@ pivot, unstack, ) + +# TODO: Pandas does not expose isclose in the root namespace. 
from cudf.core.series import isclose from cudf.core.tools.datetimes import DateOffset, to_datetime from cudf.core.tools.numeric import to_numeric @@ -128,76 +132,52 @@ del get_versions __all__ = [ - "dtype", - "api", - "core", - "datasets", - "testing", - "NA", - "Scalar", "BaseIndex", + "CategoricalDtype", "CategoricalIndex", + "DataFrame", + "DateOffset", "DatetimeIndex", + "Decimal32Dtype", + "Decimal64Dtype", "Float32Index", "Float64Index", - "Index", "GenericIndex", - "Int8Index", + "Grouper", + "Index", "Int16Index", "Int32Index", "Int64Index", + "Int8Index", + "IntervalDtype", "IntervalIndex", + "ListDtype", + "MultiIndex", + "NA", "RangeIndex", + "Scalar", + "Series", "StringIndex", + "StructDtype", "TimedeltaIndex", - "UInt8Index", "UInt16Index", "UInt32Index", "UInt64Index", - "interval_range", - "DataFrame", - "from_pandas", - "merge", - "Series", - "MultiIndex", + "UInt8Index", + "api", + "concat", "cut", + "date_range", "factorize", - "CategoricalDtype", - "Decimal64Dtype", - "Decimal32Dtype", - "IntervalDtype", - "ListDtype", - "StructDtype", - "Grouper", - "add", - "arccos", - "arcsin", - "arctan", - "cos", - "exp", - "floor_divide", - "log", - "logical_and", - "logical_not", - "logical_or", - "multiply", - "remainder", - "sin", - "sqrt", - "subtract", - "tan", - "true_divide", - "concat", + "from_dataframe", + "from_dlpack", + "from_pandas", "get_dummies", + "interval_range", "melt", + "merge", "merge_sorted", "pivot", - "unstack", - "isclose", - "DateOffset", - "to_datetime", - "to_numeric", - "from_dlpack", "read_avro", "read_csv", "read_feather", @@ -207,5 +187,8 @@ "read_parquet", "read_text", "set_allocator", - "__version__", + "testing", + "to_datetime", + "to_numeric", + "unstack", ] From e4330af46865043a00d011c05964e7d60c87792b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Dec 2021 14:46:07 -0600 Subject: [PATCH 04/58] Fix recursion logic for modules and classes. 
--- python/cudf/cudf/tests/test_doctests.py | 41 ++++++++++++++----------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index c1cf62de007..f840f44e5f6 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -5,34 +5,39 @@ import cudf -# These classes and all subclasses will be doctested -doctested_classes = [ - "Frame", - "BaseIndex", -] +def _name_in_all(parent, name, member): + return name in getattr(parent, "__all__", []) -def find_docstrings_in_module(finder, module): - print("Finding in module", module.__name__) - for docstring in finder.find(module): - print("Finding in docstring", docstring.name, docstring.filename) + +def _is_public_name(parent, name, member): + return not name.startswith("_") + + +def find_docstrings_in_obj(finder, obj, criteria=None): + for docstring in finder.find(obj): if docstring.examples: yield docstring - for name, member in inspect.getmembers(module): - if name not in getattr(module, "__all__", []): - if inspect.ismodule(member): - print("SKIPPING MODULE", module.__name__, name) - else: - print("Skipping member", module.__name__, name) + for name, member in inspect.getmembers(obj): + # Filter out non-matching objects with criteria + if criteria is not None and not criteria(obj, name, member): continue - # print("Investigating", name) + # Recurse over the public API of modules (objects defined in __all__) if inspect.ismodule(member): - yield from find_docstrings_in_module(finder, member) + yield from find_docstrings_in_obj( + finder, member, criteria=_name_in_all + ) + # Recurse over the public API of classes (attributes not prefixed with + # an underscore) + if inspect.isclass(member): + yield from find_docstrings_in_obj( + finder, member, criteria=_is_public_name + ) def fetch_doctests(): finder = doctest.DocTestFinder() - yield from find_docstrings_in_module(finder, cudf) + yield from 
find_docstrings_in_obj(finder, cudf, criteria=_name_in_all) class TestDoctests: From 1ed143a4155b07f3c28f5e57fd88d9f3699d3861 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Dec 2021 15:06:42 -0600 Subject: [PATCH 05/58] Make test methods private. --- python/cudf/cudf/tests/test_doctests.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index f840f44e5f6..b405fdb530c 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -14,7 +14,7 @@ def _is_public_name(parent, name, member): return not name.startswith("_") -def find_docstrings_in_obj(finder, obj, criteria=None): +def _find_docstrings_in_obj(finder, obj, criteria=None): for docstring in finder.find(obj): if docstring.examples: yield docstring @@ -24,25 +24,25 @@ def find_docstrings_in_obj(finder, obj, criteria=None): continue # Recurse over the public API of modules (objects defined in __all__) if inspect.ismodule(member): - yield from find_docstrings_in_obj( + yield from _find_docstrings_in_obj( finder, member, criteria=_name_in_all ) # Recurse over the public API of classes (attributes not prefixed with # an underscore) if inspect.isclass(member): - yield from find_docstrings_in_obj( + yield from _find_docstrings_in_obj( finder, member, criteria=_is_public_name ) -def fetch_doctests(): +def _fetch_doctests(): finder = doctest.DocTestFinder() - yield from find_docstrings_in_obj(finder, cudf, criteria=_name_in_all) + yield from _find_docstrings_in_obj(finder, cudf, criteria=_name_in_all) class TestDoctests: @pytest.mark.parametrize( - "docstring", fetch_doctests(), ids=lambda docstring: docstring.name + "docstring", _fetch_doctests(), ids=lambda docstring: docstring.name ) def test_docstring(self, docstring): optionflags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE From 7155cf4be5a6550f02fbd9fec8bdba95edac50d5 Mon Sep 17 00:00:00 2001 From: 
Bradley Dice Date: Wed, 1 Dec 2021 20:29:57 -0600 Subject: [PATCH 06/58] Use <NA> instead of null. --- python/cudf/cudf/core/_base_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index d688b75ed14..32dacb14e9c 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -488,7 +488,7 @@ def fillna(self, value, downcast=None): >>> import cudf >>> index = cudf.Index([1, 2, None, 4]) >>> index - Int64Index([1, 2, null, 4], dtype='int64') + Int64Index([1, 2, <NA>, 4], dtype='int64') >>> index.fillna(3) Int64Index([1, 2, 3, 4], dtype='int64') """ From 36c819f9dcf77e77554cef8d1b50bfcf8d80ed4d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Dec 2021 20:30:23 -0600 Subject: [PATCH 07/58] Inject globals into doctests. --- python/cudf/cudf/tests/test_doctests.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index b405fdb530c..154859e5050 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -1,6 +1,7 @@ import doctest import inspect +import numpy as np import pytest import cudf @@ -47,6 +48,8 @@ class TestDoctests: def test_docstring(self, docstring): optionflags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE runner = doctest.DocTestRunner(optionflags=optionflags) + globs = dict(np=np,) + docstring.globs = globs runner.run(docstring) results = runner.summarize() if results.failed: From 4f18028c811ce7bba89c6e8b98bf1c720f49e768 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Dec 2021 20:41:33 -0600 Subject: [PATCH 08/58] Add cudf to globals. 
--- python/cudf/cudf/tests/test_doctests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index 154859e5050..8c318bd19b0 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -48,7 +48,7 @@ class TestDoctests: def test_docstring(self, docstring): optionflags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE runner = doctest.DocTestRunner(optionflags=optionflags) - globs = dict(np=np,) + globs = dict(cudf=cudf, np=np,) docstring.globs = globs runner.run(docstring) results = runner.summarize() From 427a72426c0c98cc4ae9d97fa6f063703bd5bfed Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Dec 2021 20:44:13 -0600 Subject: [PATCH 09/58] Fix Series.dt. --- python/cudf/cudf/core/series.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index cf035ef457d..28daf23d60a 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -512,13 +512,26 @@ def from_pandas(cls, s, nan_as_null=None): @property def dt(self): """ - Accessor object for datetimelike properties of the Series values. + Accessor object for datetime-like properties of the Series values. Examples -------- + >>> s = cudf.Series(cudf.date_range( + ... start='2001-02-03 12:00:00', + ... end='2001-02-03 14:00:00', + ... freq='1H')) >>> s.dt.hour + 0 12 + 1 13 + dtype: int16 >>> s.dt.second + 0 0 + 1 0 + dtype: int16 >>> s.dt.day + 0 3 + 1 3 + dtype: int16 Returns ------- From 46c6435fc2c05bf28ddbbb71917bce1ee7ec5670 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Dec 2021 20:45:05 -0600 Subject: [PATCH 10/58] Fix Series.memory_usage. 
--- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 28daf23d60a..a7dda1f2c3b 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1021,7 +1021,7 @@ def memory_usage(self, index=True, deep=False): -------- >>> s = cudf.Series(range(3), index=['a','b','c']) >>> s.memory_usage() - 48 + 43 Not including the index gives the size of the rest of the data, which is necessarily smaller: From 38b2fd8fb29299ba4858f7b294a7d5e9c152f2ae Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Dec 2021 20:45:50 -0600 Subject: [PATCH 11/58] Fix Series.hash_encode(..., use_name=True). --- python/cudf/cudf/core/series.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a7dda1f2c3b..5ab1e7c63fe 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3159,9 +3159,9 @@ def hash_encode(self, stop, use_name=False): encoding by specifying `use_name=True` >>> series.hash_encode(stop=200, use_name=True) - 0 131 - 1 29 - 2 76 + 0 57 + 1 23 + 2 104 dtype: int32 """ warnings.warn( From ac0e1746406950331d2cbda183bf8ecab19af709 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Dec 2021 20:46:59 -0600 Subject: [PATCH 12/58] Fix Series.keys. 
--- python/cudf/cudf/core/series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 5ab1e7c63fe..e899144cdf0 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -3697,7 +3697,7 @@ def keys(self): dtype: int64 >>> sr.keys() - RangeIndex(start=0, stop=6) + RangeIndex(start=0, stop=6, step=1) >>> sr = cudf.Series(['a', 'b', 'c']) >>> sr 0 a @@ -3705,7 +3705,7 @@ def keys(self): 2 c dtype: object >>> sr.keys() - RangeIndex(start=0, stop=3) + RangeIndex(start=0, stop=3, step=1) >>> sr = cudf.Series([1, 2, 3], index=['a', 'b', 'c']) >>> sr a 1 From b380afe5ec25cc9110a8b6bd42f8f5de6127862e Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Dec 2021 20:48:23 -0600 Subject: [PATCH 13/58] Fix Series.drop. --- python/cudf/cudf/core/series.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e899144cdf0..b72f510944f 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -687,10 +687,12 @@ def drop( y 3 2 x 4 y 5 + dtype: int64 >>> s.drop(labels='y', level=1) 0 x 0 1 x 2 2 x 4 + Name: 2, dtype: int64 """ if labels is not None: if index is not None or columns is not None: From 318a0b793dfd9037336cc4211386dc464ff56e41 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Dec 2021 20:48:51 -0600 Subject: [PATCH 14/58] Fix Series.dropna. --- python/cudf/cudf/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index b72f510944f..d000039c1c9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1529,7 +1529,7 @@ def dropna(self, axis=0, inplace=False, how=None): >>> ser 0 1 1 2 - 2 null + 2 <NA> dtype: int64 Drop null values from a Series. 
From 1e4f183eb780ce308b1a5bbeeade8be246000412 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 1 Dec 2021 20:52:50 -0600 Subject: [PATCH 15/58] Fix Series.data, Series.as_mask. --- python/cudf/cudf/core/series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d000039c1c9..920bd8239f6 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1774,7 +1774,7 @@ def data(self): 3 4 dtype: int64 >>> series.data - + >>> series.data.to_host_array() array([1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0], dtype=uint8) @@ -1798,7 +1798,7 @@ def as_mask(self): >>> import cudf >>> s = cudf.Series([True, False, True]) >>> s.as_mask() - + >>> s.as_mask().to_host_array() array([ 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 181, 164, From 2da598d558721a1c740050837c7de91afda5353a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 08:57:56 -0600 Subject: [PATCH 16/58] Fix Series.cat. --- python/cudf/cudf/core/column/categorical.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index a2c1f04b2f2..baf477554c3 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -61,7 +61,6 @@ class CategoricalAccessor(ColumnMethods): -------- >>> s = cudf.Series([1,2,3], dtype='category') >>> s - >>> s 0 1 1 2 2 3 From e60e90961467579e69c591bbbb2053b7158d44e0 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 09:07:17 -0600 Subject: [PATCH 17/58] Fix Scalar. 
--- python/cudf/cudf/core/scalar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 787b28e213c..37bb8e32c5a 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -32,7 +32,7 @@ class Scalar(object): >>> cudf.Scalar(42, dtype='int64') + np.int8(21) Scalar(63, dtype=int64) >>> x = cudf.Scalar(42, dtype='datetime64[s]') - >>> y = cudf.Scalar(21, dtype='timedelta64[ns]) + >>> y = cudf.Scalar(21, dtype='timedelta64[ns]') >>> x - y Scalar(1970-01-01T00:00:41.999999979, dtype=datetime64[ns]) >>> cudf.Series([1,2,3]) + cudf.Scalar(1) From b7443d3a6404e6ef37cd04f7b40f94e35ccc1e3a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 09:10:51 -0600 Subject: [PATCH 18/58] Fix MultiIndex. --- python/cudf/cudf/core/multiindex.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e0c68e56f63..28bf11dcf37 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -190,7 +190,7 @@ def rename(self, names, inplace=False): Renaming each levels of a MultiIndex to specified name: >>> midx = cudf.MultiIndex.from_product( - [('A', 'B'), (2020, 2021)], names=['c1', 'c2']) + ... [('A', 'B'), (2020, 2021)], names=['c1', 'c2']) >>> midx.rename(['lv1', 'lv2']) MultiIndex([('A', 2020), ('A', 2021), @@ -1076,7 +1076,7 @@ def values(self): [4, 2], [5, 1]]) >>> type(midx.values) - + """ return self.to_frame(index=False).values @@ -1577,13 +1577,13 @@ def get_loc(self, key, method=None, tolerance=None): -------- >>> import cudf >>> mi = cudf.MultiIndex.from_tuples( - [('a', 'd'), ('b', 'e'), ('b', 'f')]) + ... 
[('a', 'd'), ('b', 'e'), ('b', 'f')]) >>> mi.get_loc('b') slice(1, 3, None) >>> mi.get_loc(('b', 'e')) 1 >>> non_monotonic_non_unique_idx = cudf.MultiIndex.from_tuples( - [('c', 'd'), ('b', 'e'), ('a', 'f'), ('b', 'e')]) + ... [('c', 'd'), ('b', 'e'), ('a', 'f'), ('b', 'e')]) >>> non_monotonic_non_unique_idx.get_loc('b') # differ from pandas slice(1, 4, 2) @@ -1599,10 +1599,10 @@ def get_loc(self, key, method=None, tolerance=None): >>> import pandas as pd >>> import cudf - >>> x = pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), - (1, 1, 1), (1, 1, 1), (2, 2, 1)] - ) + >>> x = pd.MultiIndex.from_tuples([ + ... (2, 1, 1), (1, 2, 3), (1, 2, 1), + ... (1, 1, 1), (1, 1, 1), (2, 2, 1), + ... ]) >>> x.get_loc(1) array([False, True, True, True, True, False]) >>> cudf.from_pandas(x).get_loc(1) From 9d389b55d661d2e4e1964066f6f5bcba039b96d3 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 09:11:48 -0600 Subject: [PATCH 19/58] Fix IntervalIndex.from_breaks. --- python/cudf/cudf/core/index.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8f905ee6d49..059f012dd16 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2396,9 +2396,7 @@ def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): >>> import cudf >>> import pandas as pd >>> cudf.IntervalIndex.from_breaks([0, 1, 2, 3]) - IntervalIndex([(0, 1], (1, 2], (2, 3]], - closed='right', - dtype='interval[int64]') + IntervalIndex([(0, 1], (1, 2], (2, 3]], dtype='interval') """ if copy: breaks = column.as_column(breaks, dtype=dtype).copy() From 992558a7f37b8990a67b4c1f3b345e5343db013d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 09:14:34 -0600 Subject: [PATCH 20/58] Fix DatetimeIndex.floor. 
--- python/cudf/cudf/core/index.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 059f012dd16..a259b659666 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1923,12 +1923,13 @@ def floor(self, field): Examples -------- >>> import cudf - >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:59:59" - ... ,"1999-12-31 18:44:59"]) + >>> gIndex = cudf.DatetimeIndex([ + ... "2020-05-31 08:59:59", + ... "1999-12-31 18:44:59", + ... ]) >>> gIndex.floor("T") - DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], - dtype='datetime64[ns]', freq=None) - """ + DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], dtype='datetime64[ns]') + """ # noqa: E501 out_column = self._values.floor(field) return self.__class__._from_data({self.name: out_column}) From 3864f2685dee791a7af1d737a6d14f044480d2d3 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 09:16:09 -0600 Subject: [PATCH 21/58] Fix DatetimeIndex.ceil. --- python/cudf/cudf/core/index.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index a259b659666..e8fb0c23aa1 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1892,12 +1892,13 @@ def ceil(self, field): Examples -------- >>> import cudf - >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00", - ... "1999-12-31 18:40:00"]) + >>> gIndex = cudf.DatetimeIndex([ + ... "2020-05-31 08:05:42", + ... "1999-12-31 18:40:30", + ... 
]) >>> gIndex.ceil("T") - DatetimeIndex(['2020-05-31 08:00:00', '1999-12-31 18:40:00'], - dtype='datetime64[ns]', freq=None) - """ + DatetimeIndex(['2020-05-31 08:06:00', '1999-12-31 18:41:00'], dtype='datetime64[ns]') + """ # noqa: E501 out_column = self._values.ceil(field) return self.__class__._from_data({self.name: out_column}) From 6460542b24686b36a6429b72ce4bcd248957697c Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 09:18:42 -0600 Subject: [PATCH 22/58] Fix DatetimeIndex. --- python/cudf/cudf/core/index.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e8fb0c23aa1..5b1fc13089c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1528,9 +1528,11 @@ class DatetimeIndex(GenericIndex): -------- >>> import cudf >>> cudf.DatetimeIndex([1, 2, 3, 4], name="a") - DatetimeIndex(['1970-01-01 00:00:00.001000', '1970-01-01 00:00:00.002000', - '1970-01-01 00:00:00.003000', '1970-01-01 00:00:00.004000'], - dtype='datetime64[ms]', name='a') + DatetimeIndex(['1970-01-01 00:00:00.000000001', + '1970-01-01 00:00:00.000000002', + '1970-01-01 00:00:00.000000003', + '1970-01-01 00:00:00.000000004'], + dtype='datetime64[ns]', name='a') """ def __init__( From aba3bdf02166db80bf4d0e358fe8c6120deb41a2 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 09:20:16 -0600 Subject: [PATCH 23/58] Fix DateOffset. 
--- python/cudf/cudf/core/tools/datetimes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 34d62ffc048..0d60ac2b94a 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -396,10 +396,10 @@ class DateOffset: -------- >>> from cudf import DateOffset >>> ts = cudf.Series([ - "2000-01-01 00:00:00.012345678", - "2000-01-31 00:00:00.012345678", - "2000-02-29 00:00:00.012345678", - ], dtype='datetime64[ns]) + ... "2000-01-01 00:00:00.012345678", + ... "2000-01-31 00:00:00.012345678", + ... "2000-02-29 00:00:00.012345678", + ... ], dtype='datetime64[ns]') >>> ts + DateOffset(months=3) 0 2000-04-01 00:00:00.012345678 1 2000-04-30 00:00:00.012345678 From 9e7627d77406c01fd57213d60f0e6f170c9043f1 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 09:23:13 -0600 Subject: [PATCH 24/58] Fix DataFrame.unstack. --- python/cudf/cudf/core/reshape.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index b2fac7a6140..fcf8cebe887 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -973,6 +973,7 @@ def unstack(df, level, fill_value=None): Examples -------- + >>> df = cudf.DataFrame() >>> df['a'] = [1, 1, 1, 2, 2] >>> df['b'] = [1, 2, 3, 1, 2] >>> df['c'] = [5, 6, 7, 8, 9] From 30b6d75a5534eb0932734b40cdc8c7d9fc19ff78 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 09:23:55 -0600 Subject: [PATCH 25/58] Fix DataFrame.explode. 
--- python/cudf/cudf/core/dataframe.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c0cb6f1917f..1b9b818b6f5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6302,8 +6302,11 @@ def explode(self, column, ignore_index=False): Examples -------- >>> import cudf - >>> cudf.DataFrame( - {"a": [[1, 2, 3], [], None, [4, 5]], "b": [11, 22, 33, 44]}) + >>> df = cudf.DataFrame({ + ... "a": [[1, 2, 3], [], None, [4, 5]], + ... "b": [11, 22, 33, 44], + ... }) + >>> df a b 0 [1, 2, 3] 11 1 [] 22 From 06248a3e5c2bfa1bd0aa1d785bcc8e81af00157d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 10:39:48 -0600 Subject: [PATCH 26/58] Fix formatting in DataFrame.stack. --- python/cudf/cudf/core/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1b9b818b6f5..d0ebcc2848a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5979,7 +5979,7 @@ def stack(self, level=-1, dropna=True): Examples -------- >>> import cudf - >>> df = cudf.DataFrame({'a':[0,1,3], 'b':[1,2,4]}) + >>> df = cudf.DataFrame({'a': [0, 1, 3], 'b': [1, 2, 4]}) >>> df.stack() 0 a 0 b 1 From ff3a713247172086232fb69272485a3d073d242b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 10:40:08 -0600 Subject: [PATCH 27/58] Fix DataFrame.to_csv. 
--- python/cudf/cudf/utils/ioutils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 0f9d9d53b23..b1ecbe32b5e 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -955,9 +955,9 @@ >>> import cudf >>> filename = 'foo.csv' >>> df = cudf.DataFrame({'x': [0, 1, 2, 3], - 'y': [1.0, 3.3, 2.2, 4.4], - 'z': ['a', 'b', 'c', 'd']}) ->>> df = df.set_index([3, 2, 1, 0]) +... 'y': [1.0, 3.3, 2.2, 4.4], +... 'z': ['a', 'b', 'c', 'd']}) +>>> df = df.set_index(cudf.Series([3, 2, 1, 0])) >>> df.to_csv(filename) """ From dcf2a68ffda3365e9a9c278c10a4229ba678f240 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 10:50:01 -0600 Subject: [PATCH 28/58] Fix DataFrame.query. --- python/cudf/cudf/core/dataframe.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d0ebcc2848a..d62a6193626 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3854,10 +3854,10 @@ def query(self, expr, local_dict=None): Examples -------- - >>> import cudf - >>> a = ('a', [1, 2, 2]) - >>> b = ('b', [3, 4, 5]) - >>> df = cudf.DataFrame([a, b]) + >>> df = cudf.DataFrame({ + ... "a": [1, 2, 2], + ... "b": [3, 4, 5], + ... }) >>> expr = "(a == 2 and b == 4) or (b == 3)" >>> df.query(expr) a b @@ -3873,8 +3873,8 @@ def query(self, expr, local_dict=None): >>> df['datetimes'] = data >>> search_date = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d') >>> df.query('datetimes==@search_date') - datetimes - 1 2018-10-08T00:00:00.000 + datetimes + 1 2018-10-08 Using local_dict: @@ -3885,9 +3885,9 @@ def query(self, expr, local_dict=None): >>> df['datetimes'] = data >>> search_date2 = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d') >>> df.query('datetimes==@search_date', - ... 
local_dict={'search_date':search_date2}) - datetimes - 1 2018-10-08T00:00:00.000 + ... local_dict={'search_date': search_date2}) + datetimes + 1 2018-10-08 """ # can't use `annotate` decorator here as we inspect the calling # environment. From b6ebe3d95cb197783b849d51b63e5bf9667a22da Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 10:51:40 -0600 Subject: [PATCH 29/58] Fix DataFrame.pivot. --- python/cudf/cudf/core/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index fcf8cebe887..78376e55068 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -891,7 +891,7 @@ def pivot(data, index=None, columns=None, values=None): Examples -------- >>> a = cudf.DataFrame() - >>> a['a'] = [1, 1, 2, 2], + >>> a['a'] = [1, 1, 2, 2] >>> a['b'] = ['a', 'b', 'a', 'b'] >>> a['c'] = [1, 2, 3, 4] >>> a.pivot(index='a', columns='b') From c7c240b1f391f1d3035ffe39ec46fe42c28b6010 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 10:52:41 -0600 Subject: [PATCH 30/58] Fix DataFrame.memory_usage. --- python/cudf/cudf/core/dataframe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d62a6193626..94be6105b6e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1247,10 +1247,12 @@ def memory_usage(self, index=True, deep=False): object 40000 bool 5000 dtype: int64 + Use a Categorical for efficient storage of an object-dtype column with many repeated values. + >>> df['object'].astype('category').memory_usage(deep=True) - 5048 + 5008 """ if deep: warnings.warn( From b5ecb98f5f574a7c7888506f099e4fa460e39f20 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 10:55:40 -0600 Subject: [PATCH 31/58] Fix DataFrame.info. 
--- python/cudf/cudf/core/dataframe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 94be6105b6e..31d2192b497 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4416,11 +4416,13 @@ def info( >>> buffer = io.StringIO() >>> df.info(buf=buffer) >>> s = buffer.getvalue() + >>> # TODO Can we remove this example? It writes a text file every time + >>> # tests run and it does not seem particularly helpful... >>> with open("df_info.txt", "w", ... encoding="utf-8") as f: ... f.write(s) ... - 369 + 362 The `memory_usage` parameter allows deep introspection mode, specially useful for big DataFrames and fine-tune memory optimization: From 0a467810ef9bbe0096a02c7ab486720822919d9a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 10:58:22 -0600 Subject: [PATCH 32/58] Fix DataFrame.groupby. --- python/cudf/cudf/core/groupby/groupby.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index f1d622362e2..c46e58f802e 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1256,9 +1256,10 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): -------- >>> import cudf >>> import pandas as pd - >>> df = cudf.DataFrame({'Animal': ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 'Max Speed': [380., 370., 24., 26.]}) + >>> df = cudf.DataFrame({ + ... 'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'], + ... 'Max Speed': [380., 370., 24., 26.], + ... }) >>> df Animal Max Speed 0 Falcon 380.0 @@ -1272,10 +1273,10 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): Parrot 25.0 >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... ['Captive', 'Wild', 'Captive', 'Wild']] + ... 
['Captive', 'Wild', 'Captive', 'Wild']] >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) >>> df = cudf.DataFrame({'Max Speed': [390., 350., 30., 20.]}, - index=index) + ... index=index) >>> df Max Speed Animal Type From ac41f9741199c014c4a3c794f8fef9ed23391eec Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 11:17:18 -0600 Subject: [PATCH 33/58] Fix DataFrame.__getitem__. --- python/cudf/cudf/core/dataframe.py | 48 +++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 31d2192b497..6f323ea87be 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -976,23 +976,34 @@ def __getitem__(self, arg): Examples -------- - >>> df = DataFrame([('a', list(range(20))), - ... ('b', list(range(20))), - ... ('c', list(range(20)))]) - >>> df[:4] # get first 4 rows of all columns + >>> df = cudf.DataFrame({ + ... 'a': list(range(10)), + ... 'b': list(range(10)), + ... 'c': list(range(10)), + ... }) + + Get first 4 rows of all columns. + + >>> df[:4] a b c 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 - >>> df[-5:] # get last 5 rows of all columns - a b c - 15 15 15 15 - 16 16 16 16 - 17 17 17 17 - 18 18 18 18 - 19 19 19 19 - >>> df[['a', 'c']] # get columns a and c + + Get last 5 rows of all columns. + + >>> df[-5:] + a b c + 5 5 5 5 + 6 6 6 6 + 7 7 7 7 + 8 8 8 8 + 9 9 9 9 + + Get columns a and c. + + >>> df[['a', 'c']] a c 0 0 0 1 1 1 @@ -1004,8 +1015,17 @@ def __getitem__(self, arg): 7 7 7 8 8 8 9 9 9 - >>> df[[True, False, True, False]] # mask the entire dataframe, - # returning the rows specified in the boolean mask + + Return the rows specified in the boolean mask. + + >>> df[[True, False, True, False, True, + ... 
False, True, False, True, False]] + a b c + 0 0 0 0 + 2 2 2 2 + 4 4 4 4 + 6 6 6 6 + 8 8 8 8 """ if _is_scalar_or_zero_d_array(arg) or isinstance(arg, tuple): return self._get_columns_by_label(arg, downcast=True) From 39a3050a200b7728def8bd2cc14bf27e8dbc9279 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 11:19:54 -0600 Subject: [PATCH 34/58] Fix DataFrame. --- python/cudf/cudf/core/dataframe.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6f323ea87be..c1e98c7dfa5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -469,12 +469,12 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): ... [(t0+ timedelta(seconds=x)) for x in range(n)]) ... }) >>> df - id datetimes - 0 0 2018-10-07T12:00:00.000 - 1 1 2018-10-07T12:00:01.000 - 2 2 2018-10-07T12:00:02.000 - 3 3 2018-10-07T12:00:03.000 - 4 4 2018-10-07T12:00:04.000 + id datetimes + 0 0 2018-10-07 12:00:00 + 1 1 2018-10-07 12:00:01 + 2 2 2018-10-07 12:00:02 + 3 3 2018-10-07 12:00:03 + 4 4 2018-10-07 12:00:04 Build DataFrame via list of rows as tuples: From 72f661d0406d1d5bd36e584310f792c1c50625ce Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 11:21:34 -0600 Subject: [PATCH 35/58] Fix CategoricalIndex. --- python/cudf/cudf/core/index.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 5b1fc13089c..68827d359f6 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2105,11 +2105,11 @@ class CategoricalIndex(GenericIndex): >>> import pandas as pd >>> cudf.CategoricalIndex( ... 
data=[1, 2, 3, 4], categories=[1, 2], ordered=False, name="a") - CategoricalIndex([1, 2, , ], categories=[1, 2], ordered=False, name='a', dtype='category', name='a') + CategoricalIndex([1, 2, , ], categories=[1, 2], ordered=False, dtype='category', name='a') >>> cudf.CategoricalIndex( ... data=[1, 2, 3, 4], dtype=pd.CategoricalDtype([1, 2, 3]), name="a") - CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, name='a', dtype='category', name='a') + CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, dtype='category', name='a') """ # noqa: E501 def __init__( From 6769688cc0cea9671a5c8f0286120c76f0d91831 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 11:29:37 -0600 Subject: [PATCH 36/58] Fix BaseIndex.to_pandas. --- python/cudf/cudf/core/_base_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 32dacb14e9c..043d05c66eb 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -546,7 +546,7 @@ def to_pandas(self): >>> type(idx.to_pandas()) >>> type(idx) - + """ return pd.Index(self._values.to_pandas(), name=self.name) From 28dbf05ea0cd0ae2110862bcc8dba6c786f3ee26 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 11:30:16 -0600 Subject: [PATCH 37/58] Work on BaseIndex.join - possibly an issue. --- python/cudf/cudf/core/_base_index.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 043d05c66eb..fd6198d783e 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -935,6 +935,7 @@ def is_interval(self): Examples -------- >>> import cudf + >>> import pandas as pd >>> idx = cudf.from_pandas( ... pd.Index([pd.Interval(left=0, right=5), ... 
pd.Interval(left=5, right=10)]) @@ -1098,15 +1099,16 @@ def join( Examples -------- >>> import cudf - >>> lhs = cudf.DataFrame( - ... {"a":[2, 3, 1], "b":[3, 4, 2]}).set_index(['a', 'b'] - ... ).index + >>> lhs = cudf.DataFrame({ + ... "a": [2, 3, 1], + ... "b": [3, 4, 2], + ... }).set_index(['a', 'b']).index >>> lhs MultiIndex([(2, 3), (3, 4), (1, 2)], names=['a', 'b']) - >>> rhs = cudf.DataFrame({"a":[1, 4, 3]}).set_index('a').index + >>> rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index('a').index >>> rhs Int64Index([1, 4, 3], dtype='int64', name='a') >>> lhs.join(rhs, how='inner') From f881890a2da5e139358e3d2f531d663853b53748 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 15:34:04 -0600 Subject: [PATCH 38/58] Remove to_host_array from Series.as_mask doctests. --- python/cudf/cudf/core/series.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 920bd8239f6..d5daaebecb7 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1799,13 +1799,6 @@ def as_mask(self): >>> s = cudf.Series([True, False, True]) >>> s.as_mask() - >>> s.as_mask().to_host_array() - array([ 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 181, 164, - 188, 1, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, - 127, 253, 214, 62, 241, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - dtype=uint8) """ if not is_bool_dtype(self.dtype): raise TypeError( From 5111a7143446f779c356ca94703dd69280335a3d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Dec 2021 15:34:40 -0600 Subject: [PATCH 39/58] Match current implementation of DataFrame.describe for datetime values. 
--- python/cudf/cudf/utils/docutils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index 57ad612846d..8894120529d 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -216,12 +216,12 @@ def wrapper(func): dtype: datetime64[s] >>> s.describe() count 3 - mean 2006-09-01 08:00:00.000000000 - min 2000-01-01 00:00:00.000000000 - 25% 2004-12-31 12:00:00.000000000 - 50% 2010-01-01 00:00:00.000000000 - 75% 2010-01-01 00:00:00.000000000 - max 2010-01-01 00:00:00.000000000 + mean 2006-09-01T08:00:00.000000000 + min 2000-01-01 00:00:00 + 25% 2004-12-31 12:00:00 + 50% 2010-01-01 00:00:00 + 75% 2010-01-01 00:00:00 + max 2010-01-01 00:00:00 dtype: object Describing a ``DataFrame``. By default only numeric fields are From 699c21aafc1f604a41dac954bbbe87ddefe8a5f5 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 3 Dec 2021 16:13:14 -0600 Subject: [PATCH 40/58] Fix DataFrame.reindex. Resolves #9827. --- python/cudf/cudf/core/dataframe.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c1e98c7dfa5..d898c2068b4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2287,11 +2287,11 @@ def reindex( 3 3 13.0 4 4 14.0 >>> df_new - key val sum - 0 0 10.0 NaN - 3 3 13.0 NaN - 4 4 14.0 NaN - 5 -1 NaN NaN + key val sum + 0 0 10.0 + 3 3 13.0 + 4 4 14.0 + 5 """ if labels is None and index is None and columns is None: From 3a0cb9fba35a702f5c945b9abe75571a21dddef1 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 4 Jan 2022 15:18:15 -0600 Subject: [PATCH 41/58] Remove TODOs, add isclose to __all__. 
--- python/cudf/cudf/__init__.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 961438e22bc..4dadf6a1869 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -56,8 +56,6 @@ StructDtype, ) from cudf.core.groupby import Grouper - -# TODO: Math operations like add, arccos, etc. are not exposed in pandas' root namespace. from cudf.core.ops import ( add, arccos, @@ -86,8 +84,6 @@ pivot, unstack, ) - -# TODO: Pandas does not expose isclose in the root namespace. from cudf.core.series import isclose from cudf.core.tools.datetimes import DateOffset, to_datetime from cudf.core.tools.numeric import to_numeric @@ -174,6 +170,7 @@ "from_pandas", "get_dummies", "interval_range", + "isclose", "melt", "merge", "merge_sorted", From 421fe473fe90c4c9a8e5c2575ecedcd1837206c9 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 4 Jan 2022 15:43:08 -0600 Subject: [PATCH 42/58] Print buffer instead of writing a file. --- python/cudf/cudf/core/dataframe.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f79afa7e485..ee3f4b65485 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4337,20 +4337,23 @@ def info( dtypes: float64(1), int64(1), object(1) memory usage: 130.0+ bytes - Pipe output of DataFrame.info to buffer instead of sys.stdout, - get buffer content and writes to a text file: + Pipe output of DataFrame.info to a buffer instead of sys.stdout and + print buffer contents: >>> import io >>> buffer = io.StringIO() >>> df.info(buf=buffer) - >>> s = buffer.getvalue() - >>> # TODO Can we remove this example? It writes a text file every time - >>> # tests run and it does not seem particularly helpful... - >>> with open("df_info.txt", "w", - ... encoding="utf-8") as f: - ... f.write(s) - ... 
- 362 + >>> print(buffer.getvalue()) + + RangeIndex: 5 entries, 0 to 4 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 130.0+ bytes The `memory_usage` parameter allows deep introspection mode, specially useful for big DataFrames and fine-tune memory optimization: From dd90c8d963fbab4e26330472c93bd622ec8f00c2 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 4 Jan 2022 16:24:43 -0600 Subject: [PATCH 43/58] Run doctests in a temporary path to avoid file I/O in the test directory. --- python/cudf/cudf/tests/test_doctests.py | 27 +++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index 8c318bd19b0..8e08971d8ac 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -1,5 +1,7 @@ import doctest import inspect +import os +from contextlib import AbstractContextManager import numpy as np import pytest @@ -41,16 +43,37 @@ def _fetch_doctests(): yield from _find_docstrings_in_obj(finder, cudf, criteria=_name_in_all) +class _chdir(AbstractContextManager): + """Non thread-safe context manager to change the current working directory. + + Implementation copied from Python's contextlib.chdir, implemented in + October 2021. This is not yet released but can be replaced with + contextlib.chdir in the future. 
+ """ + + def __init__(self, path): + self.path = path + self._old_cwd = [] + + def __enter__(self): + self._old_cwd.append(os.getcwd()) + os.chdir(self.path) + + def __exit__(self, *excinfo): + os.chdir(self._old_cwd.pop()) + + class TestDoctests: @pytest.mark.parametrize( "docstring", _fetch_doctests(), ids=lambda docstring: docstring.name ) - def test_docstring(self, docstring): + def test_docstring(self, docstring, tmp_path): optionflags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE runner = doctest.DocTestRunner(optionflags=optionflags) globs = dict(cudf=cudf, np=np,) docstring.globs = globs - runner.run(docstring) + with _chdir(tmp_path): + runner.run(docstring) results = runner.summarize() if results.failed: raise AssertionError(results) From 4ed7c0f080a78154fbd22a6b932cc9e710f179bd Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 4 Jan 2022 16:51:50 -0600 Subject: [PATCH 44/58] Use a class-scoped autouse fixture for temporary directories. --- python/cudf/cudf/tests/test_doctests.py | 35 ++++++++----------------- 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index 8e08971d8ac..8cecad8520a 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -1,7 +1,6 @@ import doctest import inspect import os -from contextlib import AbstractContextManager import numpy as np import pytest @@ -43,37 +42,25 @@ def _fetch_doctests(): yield from _find_docstrings_in_obj(finder, cudf, criteria=_name_in_all) -class _chdir(AbstractContextManager): - """Non thread-safe context manager to change the current working directory. - - Implementation copied from Python's contextlib.chdir, implemented in - October 2021. This is not yet released but can be replaced with - contextlib.chdir in the future. 
- """ - - def __init__(self, path): - self.path = path - self._old_cwd = [] - - def __enter__(self): - self._old_cwd.append(os.getcwd()) - os.chdir(self.path) - - def __exit__(self, *excinfo): - os.chdir(self._old_cwd.pop()) - - class TestDoctests: + @pytest.fixture(autouse=True) + def chdir_to_tmp_path(tmp_path): + original_directory = os.getcwd() + try: + os.chdir(tmp_path) + yield + finally: + os.chdir(original_directory) + @pytest.mark.parametrize( "docstring", _fetch_doctests(), ids=lambda docstring: docstring.name ) - def test_docstring(self, docstring, tmp_path): + def test_docstring(self, docstring): optionflags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE runner = doctest.DocTestRunner(optionflags=optionflags) globs = dict(cudf=cudf, np=np,) docstring.globs = globs - with _chdir(tmp_path): - runner.run(docstring) + runner.run(docstring) results = runner.summarize() if results.failed: raise AssertionError(results) From c624a84700cb2285907af6e8bb472631d1e16671 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 4 Jan 2022 17:08:07 -0600 Subject: [PATCH 45/58] Fix fixture. --- python/cudf/cudf/tests/test_doctests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index 8cecad8520a..fdc11571de3 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -44,7 +44,7 @@ def _fetch_doctests(): class TestDoctests: @pytest.fixture(autouse=True) - def chdir_to_tmp_path(tmp_path): + def chdir_to_tmp_path(cls, tmp_path): original_directory = os.getcwd() try: os.chdir(tmp_path) From 21d6cadbfe4a6da6501ef0844c555735f7ba7451 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 13 Jan 2022 11:08:42 -0800 Subject: [PATCH 46/58] Clean up doctests, add comments. 
--- python/cudf/cudf/tests/test_doctests.py | 53 +++++++++++++++++-------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index fdc11571de3..4f2de193731 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -8,43 +8,52 @@ import cudf -def _name_in_all(parent, name, member): +def _name_in_all(parent, name): return name in getattr(parent, "__all__", []) -def _is_public_name(parent, name, member): +def _is_public_name(parent, name): return not name.startswith("_") -def _find_docstrings_in_obj(finder, obj, criteria=None): +def _find_doctests_in_obj(finder, obj, criteria): + """Find all doctests in an object. + + Args: + finder (doctest.DocTestFinder): The DocTestFinder object to use. + obj (module or class): The object to search for docstring examples. + criteria (callable): Callable indicating whether to recurse over + members of the provided object. + + Yields: + doctest.DocTest: The next doctest found in the object. 
+ """ for docstring in finder.find(obj): if docstring.examples: yield docstring for name, member in inspect.getmembers(obj): - # Filter out non-matching objects with criteria - if criteria is not None and not criteria(obj, name, member): + # Only recurse over members matching the criteria + if not criteria(obj, name): continue - # Recurse over the public API of modules (objects defined in __all__) + # Recurse over the public API of modules (objects defined in the + # module's __all__) if inspect.ismodule(member): - yield from _find_docstrings_in_obj( + yield from _find_doctests_in_obj( finder, member, criteria=_name_in_all ) # Recurse over the public API of classes (attributes not prefixed with # an underscore) if inspect.isclass(member): - yield from _find_docstrings_in_obj( + yield from _find_doctests_in_obj( finder, member, criteria=_is_public_name ) -def _fetch_doctests(): - finder = doctest.DocTestFinder() - yield from _find_docstrings_in_obj(finder, cudf, criteria=_name_in_all) - - class TestDoctests: @pytest.fixture(autouse=True) def chdir_to_tmp_path(cls, tmp_path): + # Some doctests generate files, so this fixture runs the tests in a + # temporary directory. original_directory = os.getcwd() try: os.chdir(tmp_path) @@ -53,13 +62,25 @@ def chdir_to_tmp_path(cls, tmp_path): os.chdir(original_directory) @pytest.mark.parametrize( - "docstring", _fetch_doctests(), ids=lambda docstring: docstring.name + "docstring", + _find_doctests_in_obj( + finder=doctest.DocTestFinder(), obj=cudf, criteria=_name_in_all + ), + ids=lambda docstring: docstring.name, ) def test_docstring(self, docstring): + # We ignore differences in whitespace in the doctest output, and enable + # the use of an ellipsis "..." to match any string in the doctest + # output. An ellipsis is useful for, e.g., memory addresses or + # imprecise floating point values. 
optionflags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE runner = doctest.DocTestRunner(optionflags=optionflags) - globs = dict(cudf=cudf, np=np,) - docstring.globs = globs + + # These global names are pre-defined and can be used in doctests + # without first importing them. + globals = dict(cudf=cudf, np=np,) + docstring.globs = globals + runner.run(docstring) results = runner.summarize() if results.failed: From a625070a0f8ba2aff7fc3b2356bbf02366ddbd58 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 13 Jan 2022 11:15:28 -0800 Subject: [PATCH 47/58] Fix TimedeltaIndex doctest. --- python/cudf/cudf/core/index.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 768ea6aa638..0bd9b0a5ea6 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2013,14 +2013,15 @@ class TimedeltaIndex(GenericIndex): -------- >>> import cudf >>> cudf.TimedeltaIndex([1132223, 2023232, 342234324, 4234324], - ... dtype='timedelta64[ns]') - TimedeltaIndex(['00:00:00.001132', '00:00:00.002023', '00:00:00.342234', - '00:00:00.004234'], - dtype='timedelta64[ns]') - >>> cudf.TimedeltaIndex([1, 2, 3, 4], dtype='timedelta64[s]', + ... dtype="timedelta64[ns]") + TimedeltaIndex(['0 days 00:00:00.001132223', '0 days 00:00:00.002023232', + '0 days 00:00:00.342234324', '0 days 00:00:00.004234324'], + dtype='timedelta64[ns]') + >>> cudf.TimedeltaIndex([1, 2, 3, 4], dtype="timedelta64[s]", ... 
name="delta-index") - TimedeltaIndex(['00:00:01', '00:00:02', '00:00:03', '00:00:04'], - dtype='timedelta64[s]', name='delta-index') + TimedeltaIndex(['0 days 00:00:01', '0 days 00:00:02', '0 days 00:00:03', + '0 days 00:00:04'], + dtype='timedelta64[s]', name='delta-index') """ def __init__( From 93ad86455593e7e20956d59b510fa0ef29574df9 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 13 Jan 2022 11:17:48 -0800 Subject: [PATCH 48/58] Update formatting of doctest to match current cuDF implementation. --- python/cudf/cudf/utils/docutils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index cc24bbb8346..2fcf996b641 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -225,13 +225,13 @@ def wrapper(func): 2 2010-01-01 dtype: datetime64[s] >>> s.describe() - count 3 - mean 2006-09-01T08:00:00.000000000 - min 2000-01-01 00:00:00 - 25% 2004-12-31 12:00:00 - 50% 2010-01-01 00:00:00 - 75% 2010-01-01 00:00:00 - max 2010-01-01 00:00:00 + count 3 + mean 2006-09-01 08:00:00 + min 2000-01-01 00:00:00 + 25% 2004-12-31 12:00:00 + 50% 2010-01-01 00:00:00 + 75% 2010-01-01 00:00:00 + max 2010-01-01 00:00:00 dtype: object Describing a ``DataFrame``. By default only numeric fields are From 86bdcfa467ae5a15d1571d8d1368c50f6e7f4875 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 13 Jan 2022 11:23:45 -0800 Subject: [PATCH 49/58] Avoid -0.99999999... in autocorrelation to ensure passing doctest if perturbed by numerical error. 
--- python/cudf/cudf/core/series.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c7f7131ca10..c176b5f5bf9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2812,11 +2812,11 @@ def autocorr(self, lag=1): Examples -------- >>> import cudf - >>> s = cudf.Series([0.25, 0.5, 0.2, -0.05]) + >>> s = cudf.Series([0.25, 0.5, 0.2, -0.05, 0.17]) >>> s.autocorr() - 0.10355263309024071 + 0.1438853844... >>> s.autocorr(lag=2) - -0.9999999999999999 + -0.9647548490... """ return self.corr(self.shift(lag)) From 1ec00877a77c5fd9c558098f012be653e8fdbcdf Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 13 Jan 2022 11:25:51 -0800 Subject: [PATCH 50/58] Fix misordered values in DatetimeIndex.round doctest. --- python/cudf/cudf/core/index.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0bd9b0a5ea6..1e493708415 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1971,7 +1971,9 @@ def round(self, freq): ... "2001-01-01 00:05:04", ... ], dtype="datetime64[ns]") >>> dt_idx - DatetimeIndex(['2001-01-01 00:04:45', '2001-01-01 00:05:04', '2001-01-01 00:04:58'], dtype='datetime64[ns]') + DatetimeIndex(['2001-01-01 00:04:45', '2001-01-01 00:04:58', + '2001-01-01 00:05:04'], + dtype='datetime64[ns]') >>> dt_idx.round('H') DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01'], dtype='datetime64[ns]') >>> dt_idx.round('T') From d6553db05889e9e10ef6e0ded2db88c038903ae5 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 13 Jan 2022 14:21:48 -0800 Subject: [PATCH 51/58] Remove try/finally. 
--- python/cudf/cudf/tests/test_doctests.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index 4f2de193731..ee6c0bc3896 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -55,11 +55,9 @@ def chdir_to_tmp_path(cls, tmp_path): # Some doctests generate files, so this fixture runs the tests in a # temporary directory. original_directory = os.getcwd() - try: - os.chdir(tmp_path) - yield - finally: - os.chdir(original_directory) + os.chdir(tmp_path) + yield + os.chdir(original_directory) @pytest.mark.parametrize( "docstring", From 8eea41a180bc69c59bffccad57db6ccca8f4f2b3 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 13 Jan 2022 14:27:39 -0800 Subject: [PATCH 52/58] Use assert not... --- python/cudf/cudf/tests/test_doctests.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index ee6c0bc3896..bc7d1bb1d55 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -81,5 +81,4 @@ def test_docstring(self, docstring): runner.run(docstring) results = runner.summarize() - if results.failed: - raise AssertionError(results) + assert not results.failed, results From 95303a3f952cd6c86be6b15050582004997e7696 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 13 Jan 2022 14:29:55 -0800 Subject: [PATCH 53/58] Use NumPy-style docstring. 
--- python/cudf/cudf/tests/test_doctests.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index bc7d1bb1d55..c78c4753f6d 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -19,14 +19,20 @@ def _is_public_name(parent, name): def _find_doctests_in_obj(finder, obj, criteria): """Find all doctests in an object. - Args: - finder (doctest.DocTestFinder): The DocTestFinder object to use. - obj (module or class): The object to search for docstring examples. - criteria (callable): Callable indicating whether to recurse over - members of the provided object. + Parameters + ---------- + finder : doctest.DocTestFinder + The DocTestFinder object to use. + obj : module or class + The object to search for docstring examples. + criteria : callable + Callable indicating whether to recurse over members of the provided + object. - Yields: - doctest.DocTest: The next doctest found in the object. + Yields + ------ + doctest.DocTest + The next doctest found in the object. """ for docstring in finder.find(obj): if docstring.examples: From f4254fdb77663cf945e31fdf785bca393768fc68 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 13 Jan 2022 14:34:36 -0800 Subject: [PATCH 54/58] Improve defaults in doctest finder. --- python/cudf/cudf/tests/test_doctests.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index c78c4753f6d..c4e4094ad03 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -16,24 +16,30 @@ def _is_public_name(parent, name): return not name.startswith("_") -def _find_doctests_in_obj(finder, obj, criteria): +def _find_doctests_in_obj(obj, finder=None, criteria=None): """Find all doctests in an object. 
Parameters ---------- - finder : doctest.DocTestFinder - The DocTestFinder object to use. obj : module or class The object to search for docstring examples. - criteria : callable + finder : doctest.DocTestFinder, optional + The DocTestFinder object to use. If not provided, a DocTestFinder is + constructed. + criteria : callable, optional Callable indicating whether to recurse over members of the provided - object. + object. If not provided, names not defined in the object's ``__all__`` + property are ignored. Yields ------ doctest.DocTest The next doctest found in the object. """ + if finder is None: + finder = doctest.DocTestFinder() + if criteria is None: + criteria = _name_in_all for docstring in finder.find(obj): if docstring.examples: yield docstring @@ -45,13 +51,13 @@ def _find_doctests_in_obj(finder, obj, criteria): # module's __all__) if inspect.ismodule(member): yield from _find_doctests_in_obj( - finder, member, criteria=_name_in_all + member, finder, criteria=_name_in_all ) # Recurse over the public API of classes (attributes not prefixed with # an underscore) if inspect.isclass(member): yield from _find_doctests_in_obj( - finder, member, criteria=_is_public_name + member, finder, criteria=_is_public_name ) @@ -67,9 +73,7 @@ def chdir_to_tmp_path(cls, tmp_path): @pytest.mark.parametrize( "docstring", - _find_doctests_in_obj( - finder=doctest.DocTestFinder(), obj=cudf, criteria=_name_in_all - ), + _find_doctests_in_obj(cudf), ids=lambda docstring: docstring.name, ) def test_docstring(self, docstring): From 3110606f9c219b65cbd0bbf7091090a086f7435a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 13 Jan 2022 14:37:46 -0800 Subject: [PATCH 55/58] Remove __all__ from accessor. 
--- python/cudf/cudf/api/extensions/accessor.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/python/cudf/cudf/api/extensions/accessor.py b/python/cudf/cudf/api/extensions/accessor.py index 524c11f048d..a27ffa90cfc 100644 --- a/python/cudf/cudf/api/extensions/accessor.py +++ b/python/cudf/cudf/api/extensions/accessor.py @@ -159,10 +159,3 @@ def register_index_accessor(name): def register_series_accessor(name): """{docstring}""" return _register_accessor(name, cudf.Series) - - -__all__ = [ - "register_dataframe_accessor", - "register_index_accessor", - "register_series_accessor", -] From f9512ad78ddad8b38eca00897aa62a8014645cba Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 14 Jan 2022 13:12:23 -0800 Subject: [PATCH 56/58] Show doctest failures in the traceback. --- python/cudf/cudf/tests/test_doctests.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py index c4e4094ad03..05d6886c297 100644 --- a/python/cudf/cudf/tests/test_doctests.py +++ b/python/cudf/cudf/tests/test_doctests.py @@ -1,5 +1,7 @@ +import contextlib import doctest import inspect +import io import os import numpy as np @@ -89,6 +91,12 @@ def test_docstring(self, docstring): globals = dict(cudf=cudf, np=np,) docstring.globs = globals - runner.run(docstring) - results = runner.summarize() - assert not results.failed, results + # Capture stdout and include failing outputs in the traceback. 
+ doctest_stdout = io.StringIO() + with contextlib.redirect_stdout(doctest_stdout): + runner.run(docstring) + results = runner.summarize() + assert not results.failed, ( + f"{results.failed} of {results.attempted} doctests failed for " + f"{docstring.name}:\n{doctest_stdout.getvalue()}" + ) From 64a17c78d0e0e8343515136792fc064673c91b01 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 14 Jan 2022 13:35:23 -0800 Subject: [PATCH 57/58] Prevent test_dataframe_to_string from leaking state into the pandas options. --- python/cudf/cudf/tests/test_dataframe.py | 116 +++++++++++------------ 1 file changed, 57 insertions(+), 59 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e5b298a8448..f71e857918d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -747,70 +747,68 @@ def test_index_astype(nelem): def test_dataframe_to_string(): - pd.options.display.max_rows = 5 - pd.options.display.max_columns = 8 - # Test basic - df = cudf.DataFrame( - {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} - ) - string = str(df) - - assert string.splitlines()[-1] == "[6 rows x 2 columns]" - - # Test skipped columns - df = cudf.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": [11, 12, 13, 14, 15, 16], - "c": [11, 12, 13, 14, 15, 16], - "d": [11, 12, 13, 14, 15, 16], - } - ) - string = df.to_string() - - assert string.splitlines()[-1] == "[6 rows x 4 columns]" - - # Test masked - df = cudf.DataFrame( - {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} - ) - - data = np.arange(6) - mask = np.zeros(1, dtype=cudf.utils.utils.mask_dtype) - mask[0] = 0b00101101 + with pd.option_context("display.max_rows", 5, "display.max_columns", 8): + # Test basic + df = cudf.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} + ) + string = str(df) - masked = cudf.Series.from_masked_array(data, mask) - assert masked.null_count == 2 - df["c"] = masked + assert 
string.splitlines()[-1] == "[6 rows x 2 columns]" - # check data - values = masked.copy() - validids = [0, 2, 3, 5] - densearray = masked.dropna().to_numpy() - np.testing.assert_equal(data[validids], densearray) - # valid position is correct + # Test skipped columns + df = cudf.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": [11, 12, 13, 14, 15, 16], + "c": [11, 12, 13, 14, 15, 16], + "d": [11, 12, 13, 14, 15, 16], + } + ) + string = df.to_string() - for i in validids: - assert data[i] == values[i] - # null position is correct - for i in range(len(values)): - if i not in validids: - assert values[i] is cudf.NA + assert string.splitlines()[-1] == "[6 rows x 4 columns]" - pd.options.display.max_rows = 10 - got = df.to_string() + # Test masked + df = cudf.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} + ) - expect = """ -a b c -0 1 11 0 -1 2 12 -2 3 13 2 -3 4 14 3 -4 5 15 -5 6 16 5 -""" - # values should match despite whitespace difference - assert got.split() == expect.split() + data = np.arange(6) + mask = np.zeros(1, dtype=cudf.utils.utils.mask_dtype) + mask[0] = 0b00101101 + + masked = cudf.Series.from_masked_array(data, mask) + assert masked.null_count == 2 + df["c"] = masked + + # check data + values = masked.copy() + validids = [0, 2, 3, 5] + densearray = masked.dropna().to_numpy() + np.testing.assert_equal(data[validids], densearray) + # valid position is correct + + for i in validids: + assert data[i] == values[i] + # null position is correct + for i in range(len(values)): + if i not in validids: + assert values[i] is cudf.NA + + with pd.option_context("display.max_rows", 10): + got = df.to_string() + expect = textwrap.dedent( + """\ + a b c + 0 1 11 0 + 1 2 12 + 2 3 13 2 + 3 4 14 3 + 4 5 15 + 5 6 16 5""" + ) + assert got == expect def test_dataframe_to_string_wide(monkeypatch): From e1a19bcfdd619d2014831a229a85ab60756b2edd Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 14 Jan 2022 13:57:13 -0800 Subject: [PATCH 58/58] Split 
test_dataframe_to_string into multiple tests. --- python/cudf/cudf/tests/test_dataframe.py | 133 ++++++++++++++--------- 1 file changed, 81 insertions(+), 52 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f71e857918d..40d0d0f4fcc 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -746,69 +746,98 @@ def test_index_astype(nelem): np.testing.assert_equal(df.index.to_numpy(), df["a"].to_numpy()) -def test_dataframe_to_string(): - with pd.option_context("display.max_rows", 5, "display.max_columns", 8): - # Test basic - df = cudf.DataFrame( - {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} - ) - string = str(df) +def test_dataframe_to_string_with_skipped_rows(): + # Test skipped rows + df = cudf.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} + ) - assert string.splitlines()[-1] == "[6 rows x 2 columns]" + with pd.option_context("display.max_rows", 5): + got = df.to_string() - # Test skipped columns - df = cudf.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6], - "b": [11, 12, 13, 14, 15, 16], - "c": [11, 12, 13, 14, 15, 16], - "d": [11, 12, 13, 14, 15, 16], - } - ) - string = df.to_string() + expect = textwrap.dedent( + """\ + a b + 0 1 11 + 1 2 12 + .. .. .. 
+ 4 5 15 + 5 6 16 - assert string.splitlines()[-1] == "[6 rows x 4 columns]" + [6 rows x 2 columns]""" + ) + assert got == expect - # Test masked - df = cudf.DataFrame( - {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} - ) - data = np.arange(6) - mask = np.zeros(1, dtype=cudf.utils.utils.mask_dtype) - mask[0] = 0b00101101 +def test_dataframe_to_string_with_skipped_rows_and_columns(): + # Test skipped rows and skipped columns + df = cudf.DataFrame( + { + "a": [1, 2, 3, 4, 5, 6], + "b": [11, 12, 13, 14, 15, 16], + "c": [11, 12, 13, 14, 15, 16], + "d": [11, 12, 13, 14, 15, 16], + } + ) + + with pd.option_context("display.max_rows", 5, "display.max_columns", 3): + got = df.to_string() - masked = cudf.Series.from_masked_array(data, mask) - assert masked.null_count == 2 - df["c"] = masked + expect = textwrap.dedent( + """\ + a ... d + 0 1 ... 11 + 1 2 ... 12 + .. .. ... .. + 4 5 ... 15 + 5 6 ... 16 - # check data - values = masked.copy() - validids = [0, 2, 3, 5] - densearray = masked.dropna().to_numpy() - np.testing.assert_equal(data[validids], densearray) - # valid position is correct + [6 rows x 4 columns]""" + ) + assert got == expect - for i in validids: - assert data[i] == values[i] - # null position is correct - for i in range(len(values)): - if i not in validids: - assert values[i] is cudf.NA + +def test_dataframe_to_string_with_masked_data(): + # Test masked data + df = cudf.DataFrame( + {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} + ) + + data = np.arange(6) + mask = np.zeros(1, dtype=cudf.utils.utils.mask_dtype) + mask[0] = 0b00101101 + + masked = cudf.Series.from_masked_array(data, mask) + assert masked.null_count == 2 + df["c"] = masked + + # Check data + values = masked.copy() + validids = [0, 2, 3, 5] + densearray = masked.dropna().to_numpy() + np.testing.assert_equal(data[validids], densearray) + # Valid position is correct + for i in validids: + assert data[i] == values[i] + # Null position is correct + for i in range(len(values)): + 
if i not in validids: + assert values[i] is cudf.NA with pd.option_context("display.max_rows", 10): got = df.to_string() - expect = textwrap.dedent( - """\ - a b c - 0 1 11 0 - 1 2 12 - 2 3 13 2 - 3 4 14 3 - 4 5 15 - 5 6 16 5""" - ) - assert got == expect + + expect = textwrap.dedent( + """\ + a b c + 0 1 11 0 + 1 2 12 + 2 3 13 2 + 3 4 14 3 + 4 5 15 + 5 6 16 5""" + ) + assert got == expect def test_dataframe_to_string_wide(monkeypatch):