diff --git a/cpp/src/search/search.cu b/cpp/src/search/search.cu index 241b3c595f1..81ed3cfbd51 100644 --- a/cpp/src/search/search.cu +++ b/cpp/src/search/search.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -173,11 +173,56 @@ bool contains_scalar_dispatch::operator()(column_view const&, } template <> -bool contains_scalar_dispatch::operator()(column_view const&, - scalar const&, - rmm::cuda_stream_view) +bool contains_scalar_dispatch::operator()(column_view const& col, + scalar const& value, + rmm::cuda_stream_view stream) { - CUDF_FAIL("struct_view type not supported yet"); + CUDF_EXPECTS(col.type() == value.type(), "scalar and column types must match"); + + auto const scalar_table = static_cast(&value)->view(); + CUDF_EXPECTS(col.num_children() == scalar_table.num_columns(), + "struct scalar and structs column must have the same number of children"); + for (size_type i = 0; i < col.num_children(); ++i) { + CUDF_EXPECTS(col.child(i).type() == scalar_table.column(i).type(), + "scalar and column children types must match"); + } + + // Prepare to flatten the structs column and scalar. + auto const has_null_elements = + has_nested_nulls(table_view{std::vector{col.child_begin(), col.child_end()}}) || + has_nested_nulls(scalar_table); + auto const flatten_nullability = has_null_elements + ? structs::detail::column_nullability::FORCE + : structs::detail::column_nullability::MATCH_INCOMING; + + // Flatten the input structs column, only materialize the bitmask if there is null in the input. + auto const col_flattened = + structs::detail::flatten_nested_columns(table_view{{col}}, {}, {}, flatten_nullability); + auto const val_flattened = + structs::detail::flatten_nested_columns(scalar_table, {}, {}, flatten_nullability); + + // The struct scalar only contains the struct member columns. + // Thus, if there is any null in the input, we must exclude the first column in the flattened + // table of the input column from searching because that column is the materialized bitmask of + // the input structs column. + auto const col_flattened_content = col_flattened.flattened_columns(); + auto const col_flattened_children = table_view{std::vector{ + col_flattened_content.begin() + static_cast(has_null_elements), + col_flattened_content.end()}}; + + auto const d_col_children_ptr = table_device_view::create(col_flattened_children, stream); + auto const d_val_ptr = table_device_view::create(val_flattened, stream); + + auto const start_iter = thrust::make_counting_iterator(0); + auto const end_iter = start_iter + col.size(); + auto const comp = row_equality_comparator( + nullate::DYNAMIC{has_null_elements}, *d_col_children_ptr, *d_val_ptr, null_equality::EQUAL); + auto const found_iter = thrust::find_if( + rmm::exec_policy(stream), start_iter, end_iter, [comp] __device__(auto const idx) { + return comp(idx, 0); // compare col[idx] == val[0]. + }); + + return found_iter != end_iter; } template <> @@ -203,7 +248,6 @@ namespace detail { bool contains(column_view const& col, scalar const& value, rmm::cuda_stream_view stream) { if (col.is_empty()) { return false; } - if (not value.is_valid(stream)) { return col.has_nulls(); } return cudf::type_dispatcher(col.type(), contains_scalar_dispatch{}, col, value, stream); @@ -264,20 +308,14 @@ struct multi_contains_dispatch { template <> std::unique_ptr multi_contains_dispatch::operator()( - column_view const& haystack, - column_view const& needles, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + column_view const&, column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) { CUDF_FAIL("list_view type not supported"); } template <> std::unique_ptr multi_contains_dispatch::operator()( - column_view const& haystack, - column_view const& needles, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) + column_view const&, column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) { CUDF_FAIL("struct_view type not supported"); } diff --git a/cpp/tests/search/search_struct_test.cpp b/cpp/tests/search/search_struct_test.cpp index db2ecb89d6a..a1f0b1d81cf 100644 --- a/cpp/tests/search/search_struct_test.cpp +++ b/cpp/tests/search/search_struct_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -35,15 +36,14 @@ constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_leve constexpr int32_t null{0}; // Mark for null child elements constexpr int32_t XXX{0}; // Mark for null struct elements -template -struct TypedStructSearchTest : public cudf::test::BaseFixture { -}; - using TestTypes = cudf::test::Concat; +template +struct TypedStructSearchTest : public cudf::test::BaseFixture { +}; TYPED_TEST_SUITE(TypedStructSearchTest, TestTypes); namespace { @@ -353,3 +353,234 @@ TYPED_TEST(TypedStructSearchTest, ComplexStructTest) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_lower_bound, results.first->view(), verbosity); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_upper_bound, results.second->view(), verbosity); } + +template +struct TypedScalarStructContainTest : public cudf::test::BaseFixture { +}; +TYPED_TEST_SUITE(TypedScalarStructContainTest, TestTypes); + +TYPED_TEST(TypedScalarStructContainTest, EmptyInputTest) +{ + using col_wrapper = cudf::test::fixed_width_column_wrapper; + + auto const col = [] { + auto child = col_wrapper{}; + return structs_col{{child}}; + }(); + + auto const val = [] { + auto child = col_wrapper{1}; + return cudf::struct_scalar(std::vector{child}); + }(); + + EXPECT_EQ(false, cudf::contains(col, val)); +} + +TYPED_TEST(TypedScalarStructContainTest, TrivialInputTests) +{ + using col_wrapper = cudf::test::fixed_width_column_wrapper; + + auto const col = [] { + auto child1 = col_wrapper{1, 2, 3}; + auto child2 = col_wrapper{4, 5, 6}; + auto child3 = strings_col{"x", "y", "z"}; + return structs_col{{child1, child2, child3}}; + }(); + + auto const val1 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = strings_col{"x"}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + auto const val2 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = strings_col{"a"}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + + EXPECT_EQ(true, cudf::contains(col, val1)); + EXPECT_EQ(false, cudf::contains(col, val2)); +} + +TYPED_TEST(TypedScalarStructContainTest, SlicedColumnInputTests) +{ + using col_wrapper = cudf::test::fixed_width_column_wrapper; + + constexpr int32_t dont_care{0}; + + auto const col_original = [] { + auto child1 = col_wrapper{dont_care, dont_care, 1, 2, 3, dont_care}; + auto child2 = col_wrapper{dont_care, dont_care, 4, 5, 6, dont_care}; + auto child3 = strings_col{"dont_care", "dont_care", "x", "y", "z", "dont_care"}; + return structs_col{{child1, child2, child3}}; + }(); + auto const col = cudf::slice(col_original, {2, 5})[0]; + + auto const val1 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = strings_col{"x"}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + auto const val2 = [] { + auto child1 = col_wrapper{dont_care}; + auto child2 = col_wrapper{dont_care}; + auto child3 = strings_col{"dont_care"}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + + EXPECT_EQ(true, cudf::contains(col, val1)); + EXPECT_EQ(false, cudf::contains(col, val2)); +} + +TYPED_TEST(TypedScalarStructContainTest, SimpleInputWithNullsTests) +{ + using col_wrapper = cudf::test::fixed_width_column_wrapper; + + constexpr int32_t null{0}; + + // Test with nulls at the top level. + { + auto const col = [] { + auto child1 = col_wrapper{1, null, 3}; + auto child2 = col_wrapper{4, null, 6}; + auto child3 = strings_col{"x", "" /*NULL*/, "z"}; + return structs_col{{child1, child2, child3}, null_at(1)}; + }(); + + auto const val1 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = strings_col{"x"}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + auto const val2 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = strings_col{"a"}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + + EXPECT_EQ(true, cudf::contains(col, val1)); + EXPECT_EQ(false, cudf::contains(col, val2)); + } + + // Test with nulls at the children level. + { + auto const col = [] { + auto child1 = col_wrapper{{1, null, 3}, null_at(1)}; + auto child2 = col_wrapper{{4, null, 6}, null_at(1)}; + auto child3 = strings_col{{"" /*NULL*/, "y", "z"}, null_at(0)}; + return structs_col{{child1, child2, child3}}; + }(); + + auto const val1 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = strings_col{{"" /*NULL*/}, null_at(0)}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + auto const val2 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = strings_col{""}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + + EXPECT_EQ(true, cudf::contains(col, val1)); + EXPECT_EQ(false, cudf::contains(col, val2)); + } + + // Test with nulls in the input scalar. + { + auto const col = [] { + auto child1 = col_wrapper{1, 2, 3}; + auto child2 = col_wrapper{4, 5, 6}; + auto child3 = strings_col{"x", "y", "z"}; + return structs_col{{child1, child2, child3}}; + }(); + + auto const val1 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = strings_col{"x"}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + auto const val2 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = strings_col{{"" /*NULL*/}, null_at(0)}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + + EXPECT_EQ(true, cudf::contains(col, val1)); + EXPECT_EQ(false, cudf::contains(col, val2)); + } +} + +TYPED_TEST(TypedScalarStructContainTest, SlicedInputWithNullsTests) +{ + using col_wrapper = cudf::test::fixed_width_column_wrapper; + + constexpr int32_t dont_care{0}; + constexpr int32_t null{0}; + + // Test with nulls at the top level. + { + auto const col_original = [] { + auto child1 = col_wrapper{dont_care, dont_care, 1, null, 3, dont_care}; + auto child2 = col_wrapper{dont_care, dont_care, 4, null, 6, dont_care}; + auto child3 = strings_col{"dont_care", "dont_care", "x", "" /*NULL*/, "z", "dont_care"}; + return structs_col{{child1, child2, child3}, null_at(3)}; + }(); + auto const col = cudf::slice(col_original, {2, 5})[0]; + + auto const val1 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = strings_col{"x"}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + auto const val2 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = strings_col{"a"}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + + EXPECT_EQ(true, cudf::contains(col, val1)); + EXPECT_EQ(false, cudf::contains(col, val2)); + } + + // Test with nulls at the children level. + { + auto const col_original = [] { + auto child1 = + col_wrapper{{dont_care, dont_care /*also NULL*/, 1, null, 3, dont_care}, null_at(3)}; + auto child2 = + col_wrapper{{dont_care, dont_care /*also NULL*/, 4, null, 6, dont_care}, null_at(3)}; + auto child3 = strings_col{ + {"dont_care", "dont_care" /*also NULL*/, "" /*NULL*/, "y", "z", "dont_care"}, null_at(2)}; + return structs_col{{child1, child2, child3}, null_at(1)}; + }(); + auto const col = cudf::slice(col_original, {2, 5})[0]; + + auto const val1 = [] { + auto child1 = col_wrapper{1}; + auto child2 = col_wrapper{4}; + auto child3 = strings_col{{"x"}, null_at(0)}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + auto const val2 = [] { + auto child1 = col_wrapper{dont_care}; + auto child2 = col_wrapper{dont_care}; + auto child3 = strings_col{"dont_care"}; + return cudf::struct_scalar(std::vector{child1, child2, child3}); + }(); + + EXPECT_EQ(true, cudf::contains(col, val1)); + EXPECT_EQ(false, cudf::contains(col, val2)); + } +} diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 2461e7b09bc..4dadf6a1869 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -126,3 +126,66 @@ __version__ = get_versions()["version"] del get_versions + +__all__ = [ + "BaseIndex", + "CategoricalDtype", + "CategoricalIndex", + "DataFrame", + "DateOffset", + "DatetimeIndex", + "Decimal32Dtype", + "Decimal64Dtype", + "Float32Index", + "Float64Index", + "GenericIndex", + "Grouper", + "Index", + "Int16Index", + "Int32Index", + "Int64Index", + "Int8Index", + "IntervalDtype", + "IntervalIndex", + "ListDtype", + "MultiIndex", + "NA", + "RangeIndex", + "Scalar", + "Series", + "StringIndex", + "StructDtype", + "TimedeltaIndex", + "UInt16Index", + "UInt32Index", + "UInt64Index", + "UInt8Index", + "api", + "concat", + "cut", + "date_range", + "factorize", + "from_dataframe", + "from_dlpack", + "from_pandas", + "get_dummies", + "interval_range", + "isclose", + "melt", + "merge", + "merge_sorted", + "pivot", + "read_avro", + "read_csv", + "read_feather", + "read_hdf", + "read_json", + "read_orc", + "read_parquet", + "read_text", + "set_allocator", + "testing", + "to_datetime", + "to_numeric", + "unstack", +] diff --git a/python/cudf/cudf/api/__init__.py b/python/cudf/cudf/api/__init__.py index 21c24015e41..c66bfb4efeb 100644 --- a/python/cudf/cudf/api/__init__.py +++ b/python/cudf/cudf/api/__init__.py @@ -1,3 +1,5 @@ # Copyright (c) 2021, NVIDIA CORPORATION. -from cudf.api import types +from cudf.api import extensions, types + +__all__ = ["extensions", "types"] diff --git a/python/cudf/cudf/api/extensions/__init__.py b/python/cudf/cudf/api/extensions/__init__.py index c971e6f7731..eeb5dcdb32a 100644 --- a/python/cudf/cudf/api/extensions/__init__.py +++ b/python/cudf/cudf/api/extensions/__init__.py @@ -5,3 +5,9 @@ register_index_accessor, register_series_accessor, ) + +__all__ = [ + "register_dataframe_accessor", + "register_index_accessor", + "register_series_accessor", +] diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 683f3fefe1c..4f2614e843f 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -495,7 +495,7 @@ def fillna(self, value, downcast=None): >>> import cudf >>> index = cudf.Index([1, 2, None, 4]) >>> index - Int64Index([1, 2, null, 4], dtype='int64') + Int64Index([1, 2, , 4], dtype='int64') >>> index.fillna(3) Int64Index([1, 2, 3, 4], dtype='int64') """ @@ -553,7 +553,7 @@ def to_pandas(self): >>> type(idx.to_pandas()) >>> type(idx) - + """ return pd.Index(self._values.to_pandas(), name=self.name) @@ -942,6 +942,7 @@ def is_interval(self): Examples -------- >>> import cudf + >>> import pandas as pd >>> idx = cudf.from_pandas( ... pd.Index([pd.Interval(left=0, right=5), ... pd.Interval(left=5, right=10)]) @@ -1105,15 +1106,16 @@ def join( Examples -------- >>> import cudf - >>> lhs = cudf.DataFrame( - ... {"a":[2, 3, 1], "b":[3, 4, 2]}).set_index(['a', 'b'] - ... ).index + >>> lhs = cudf.DataFrame({ + ... "a": [2, 3, 1], + ... "b": [3, 4, 2], + ... }).set_index(['a', 'b']).index >>> lhs MultiIndex([(2, 3), (3, 4), (1, 2)], names=['a', 'b']) - >>> rhs = cudf.DataFrame({"a":[1, 4, 3]}).set_index('a').index + >>> rhs = cudf.DataFrame({"a": [1, 4, 3]}).set_index('a').index >>> rhs Int64Index([1, 4, 3], dtype='int64', name='a') >>> lhs.join(rhs, how='inner') diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 4be7a422de0..de06e62cbb1 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -61,7 +61,6 @@ class CategoricalAccessor(ColumnMethods): -------- >>> s = cudf.Series([1,2,3], dtype='category') >>> s - >>> s 0 1 1 2 2 3 diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 123f86cc200..69600426ec0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -463,12 +463,12 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): ... [(t0+ timedelta(seconds=x)) for x in range(n)]) ... }) >>> df - id datetimes - 0 0 2018-10-07T12:00:00.000 - 1 1 2018-10-07T12:00:01.000 - 2 2 2018-10-07T12:00:02.000 - 3 3 2018-10-07T12:00:03.000 - 4 4 2018-10-07T12:00:04.000 + id datetimes + 0 0 2018-10-07 12:00:00 + 1 1 2018-10-07 12:00:01 + 2 2 2018-10-07 12:00:02 + 3 3 2018-10-07 12:00:03 + 4 4 2018-10-07 12:00:04 Build DataFrame via list of rows as tuples: @@ -984,23 +984,34 @@ def __getitem__(self, arg): Examples -------- - >>> df = DataFrame([('a', list(range(20))), - ... ('b', list(range(20))), - ... ('c', list(range(20)))]) - >>> df[:4] # get first 4 rows of all columns + >>> df = cudf.DataFrame({ + ... 'a': list(range(10)), + ... 'b': list(range(10)), + ... 'c': list(range(10)), + ... }) + + Get first 4 rows of all columns. + + >>> df[:4] a b c 0 0 0 0 1 1 1 1 2 2 2 2 3 3 3 3 - >>> df[-5:] # get last 5 rows of all columns - a b c - 15 15 15 15 - 16 16 16 16 - 17 17 17 17 - 18 18 18 18 - 19 19 19 19 - >>> df[['a', 'c']] # get columns a and c + + Get last 5 rows of all columns. + + >>> df[-5:] + a b c + 5 5 5 5 + 6 6 6 6 + 7 7 7 7 + 8 8 8 8 + 9 9 9 9 + + Get columns a and c. + + >>> df[['a', 'c']] a c 0 0 0 1 1 1 @@ -1012,8 +1023,17 @@ def __getitem__(self, arg): 7 7 7 8 8 8 9 9 9 - >>> df[[True, False, True, False]] # mask the entire dataframe, - # returning the rows specified in the boolean mask + + Return the rows specified in the boolean mask. + + >>> df[[True, False, True, False, True, + ... False, True, False, True, False]] + a b c + 0 0 0 0 + 2 2 2 2 + 4 4 4 4 + 6 6 6 6 + 8 8 8 8 """ if _is_scalar_or_zero_d_array(arg) or isinstance(arg, tuple): return self._get_columns_by_label(arg, downcast=True) @@ -1123,7 +1143,15 @@ def __setitem__(self, arg, value): for col_name in self._data: self._data[col_name][mask] = value else: - if isinstance(value, DataFrame): + if isinstance(value, (cupy.ndarray, np.ndarray)): + _setitem_with_dataframe( + input_df=self, + replace_df=cudf.DataFrame(value), + input_cols=arg, + mask=None, + ignore_index=True, + ) + elif isinstance(value, DataFrame): _setitem_with_dataframe( input_df=self, replace_df=value, @@ -1253,10 +1281,12 @@ def memory_usage(self, index=True, deep=False): object 40000 bool 5000 dtype: int64 + Use a Categorical for efficient storage of an object-dtype column with many repeated values. + >>> df['object'].astype('category').memory_usage(deep=True) - 5048 + 5008 """ if deep: warnings.warn( @@ -1548,10 +1578,18 @@ def _concat( cudf.core.index.as_index(out.index._values) ) - # Reassign precision for any decimal cols + # Reassign precision for decimal cols & type schema for struct cols for name, col in out._data.items(): - if isinstance(col, cudf.core.column.Decimal64Column): - col = col._with_type_metadata(tables[0]._data[name].dtype) + if isinstance( + col, + ( + cudf.core.column.Decimal64Column, + cudf.core.column.StructColumn, + ), + ): + out._data[name] = col._with_type_metadata( + tables[0]._data[name].dtype + ) # Reassign index and column names if isinstance(objs[0].columns, pd.MultiIndex): @@ -2209,11 +2247,11 @@ def reindex( 3 3 13.0 4 4 14.0 >>> df_new - key val sum - 0 0 10.0 NaN - 3 3 13.0 NaN - 4 4 14.0 NaN - 5 -1 NaN NaN + key val sum + 0 0 10.0 + 3 3 13.0 + 4 4 14.0 + 5 """ if labels is None and index is None and columns is None: @@ -3685,10 +3723,10 @@ def query(self, expr, local_dict=None): Examples -------- - >>> import cudf - >>> a = ('a', [1, 2, 2]) - >>> b = ('b', [3, 4, 5]) - >>> df = cudf.DataFrame([a, b]) + >>> df = cudf.DataFrame({ + ... "a": [1, 2, 2], + ... "b": [3, 4, 5], + ... }) >>> expr = "(a == 2 and b == 4) or (b == 3)" >>> df.query(expr) a b @@ -3704,8 +3742,8 @@ def query(self, expr, local_dict=None): >>> df['datetimes'] = data >>> search_date = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d') >>> df.query('datetimes==@search_date') - datetimes - 1 2018-10-08T00:00:00.000 + datetimes + 1 2018-10-08 Using local_dict: @@ -3716,9 +3754,9 @@ def query(self, expr, local_dict=None): >>> df['datetimes'] = data >>> search_date2 = datetime.datetime.strptime('2018-10-08', '%Y-%m-%d') >>> df.query('datetimes==@search_date', - ... local_dict={'search_date':search_date2}) - datetimes - 1 2018-10-08T00:00:00.000 + ... local_dict={'search_date': search_date2}) + datetimes + 1 2018-10-08 """ # can't use `annotate` decorator here as we inspect the calling # environment. @@ -4173,18 +4211,23 @@ def info( dtypes: float64(1), int64(1), object(1) memory usage: 130.0+ bytes - Pipe output of DataFrame.info to buffer instead of sys.stdout, - get buffer content and writes to a text file: + Pipe output of DataFrame.info to a buffer instead of sys.stdout and + print buffer contents: >>> import io >>> buffer = io.StringIO() >>> df.info(buf=buffer) - >>> s = buffer.getvalue() - >>> with open("df_info.txt", "w", - ... encoding="utf-8") as f: - ... f.write(s) - ... - 369 + >>> print(buffer.getvalue()) + + RangeIndex: 5 entries, 0 to 4 + Data columns (total 3 columns): + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 130.0+ bytes The `memory_usage` parameter allows deep introspection mode, specially useful for big DataFrames and fine-tune memory optimization: @@ -5745,7 +5788,7 @@ def stack(self, level=-1, dropna=True): Examples -------- >>> import cudf - >>> df = cudf.DataFrame({'a':[0,1,3], 'b':[1,2,4]}) + >>> df = cudf.DataFrame({'a': [0, 1, 3], 'b': [1, 2, 4]}) >>> df.stack() 0 a 0 b 1 @@ -6068,8 +6111,11 @@ def explode(self, column, ignore_index=False): Examples -------- >>> import cudf - >>> cudf.DataFrame( - {"a": [[1, 2, 3], [], None, [4, 5]], "b": [11, 22, 33, 44]}) + >>> df = cudf.DataFrame({ + ... "a": [[1, 2, 3], [], None, [4, 5]], + ... "b": [11, 22, 33, 44], + ... }) + >>> df a b 0 [1, 2, 3] 11 1 [] 22 @@ -6393,6 +6439,7 @@ def _setitem_with_dataframe( replace_df: DataFrame, input_cols: Any = None, mask: Optional[cudf.core.column.ColumnBase] = None, + ignore_index: bool = False, ): """ This function sets item dataframes relevant columns with replacement df @@ -6400,6 +6447,7 @@ def _setitem_with_dataframe( :param replace_df: Replacement DataFrame to replace values with :param input_cols: columns to replace in the input dataframe :param mask: boolean mask in case of masked replacing + :param ignore_index: Whether to conduct index equality and reindex """ if input_cols is None: @@ -6410,7 +6458,11 @@ def _setitem_with_dataframe( "Number of Input Columns must be same replacement Dataframe" ) - if len(input_df) != 0 and not input_df.index.equals(replace_df.index): + if ( + not ignore_index + and len(input_df) != 0 + and not input_df.index.equals(replace_df.index) + ): replace_df = replace_df.reindex(input_df.index) for col_1, col_2 in zip(input_cols, replace_df.columns): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 08ef3f07776..5b041ba53b9 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1256,9 +1256,10 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): -------- >>> import cudf >>> import pandas as pd - >>> df = cudf.DataFrame({'Animal': ['Falcon', 'Falcon', - ... 'Parrot', 'Parrot'], - ... 'Max Speed': [380., 370., 24., 26.]}) + >>> df = cudf.DataFrame({ + ... 'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'], + ... 'Max Speed': [380., 370., 24., 26.], + ... }) >>> df Animal Max Speed 0 Falcon 380.0 @@ -1272,10 +1273,10 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): Parrot 25.0 >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], - ... ['Captive', 'Wild', 'Captive', 'Wild']] + ... ['Captive', 'Wild', 'Captive', 'Wild']] >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) >>> df = cudf.DataFrame({'Max Speed': [390., 350., 30., 20.]}, - index=index) + ... index=index) >>> df Max Speed Animal Type diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 859a81bc5f4..1e493708415 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1535,9 +1535,11 @@ class DatetimeIndex(GenericIndex): -------- >>> import cudf >>> cudf.DatetimeIndex([1, 2, 3, 4], name="a") - DatetimeIndex(['1970-01-01 00:00:00.001000', '1970-01-01 00:00:00.002000', - '1970-01-01 00:00:00.003000', '1970-01-01 00:00:00.004000'], - dtype='datetime64[ms]', name='a') + DatetimeIndex(['1970-01-01 00:00:00.000000001', + '1970-01-01 00:00:00.000000002', + '1970-01-01 00:00:00.000000003', + '1970-01-01 00:00:00.000000004'], + dtype='datetime64[ns]', name='a') """ def __init__( @@ -1899,12 +1901,13 @@ def ceil(self, freq): Examples -------- >>> import cudf - >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:00:00", - ... "1999-12-31 18:40:00"]) + >>> gIndex = cudf.DatetimeIndex([ + ... "2020-05-31 08:05:42", + ... "1999-12-31 18:40:30", + ... ]) >>> gIndex.ceil("T") - DatetimeIndex(['2020-05-31 08:00:00', '1999-12-31 18:40:00'], - dtype='datetime64[ns]', freq=None) - """ + DatetimeIndex(['2020-05-31 08:06:00', '1999-12-31 18:41:00'], dtype='datetime64[ns]') + """ # noqa: E501 out_column = self._values.ceil(freq) return self.__class__._from_data({self.name: out_column}) @@ -1930,12 +1933,13 @@ def floor(self, freq): Examples -------- >>> import cudf - >>> gIndex = cudf.DatetimeIndex(["2020-05-31 08:59:59" - ... ,"1999-12-31 18:44:59"]) + >>> gIndex = cudf.DatetimeIndex([ + ... "2020-05-31 08:59:59", + ... "1999-12-31 18:44:59", + ... ]) >>> gIndex.floor("T") - DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], - dtype='datetime64[ns]', freq=None) - """ + DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], dtype='datetime64[ns]') + """ # noqa: E501 out_column = self._values.floor(freq) return self.__class__._from_data({self.name: out_column}) @@ -1967,21 +1971,14 @@ def round(self, freq): ... "2001-01-01 00:05:04", ... ], dtype="datetime64[ns]") >>> dt_idx - DatetimeIndex(['2001-01-01 00:04:45', - '2001-01-01 00:05:04', - '2001-01-01 00:04:58'], - dtype='datetime64[ns]', freq=None) + DatetimeIndex(['2001-01-01 00:04:45', '2001-01-01 00:04:58', + '2001-01-01 00:05:04'], + dtype='datetime64[ns]') >>> dt_idx.round('H') - DatetimeIndex(['2001-01-01', - '2001-01-01', - '2001-01-01'], - dtype='datetime64[ns]', freq=None) + DatetimeIndex(['2001-01-01', '2001-01-01', '2001-01-01'], dtype='datetime64[ns]') >>> dt_idx.round('T') - DatetimeIndex(['2001-01-01 00:05:00', - '2001-01-01 00:05:00', - '2001-01-01 00:05:00'], - dtype='datetime64[ns]', freq=None) - """ + DatetimeIndex(['2001-01-01 00:05:00', '2001-01-01 00:05:00', '2001-01-01 00:05:00'], dtype='datetime64[ns]') + """ # noqa: E501 out_column = self._values.round(freq) return self.__class__._from_data({self.name: out_column}) @@ -2018,14 +2015,15 @@ class TimedeltaIndex(GenericIndex): -------- >>> import cudf >>> cudf.TimedeltaIndex([1132223, 2023232, 342234324, 4234324], - ... dtype='timedelta64[ns]') - TimedeltaIndex(['00:00:00.001132', '00:00:00.002023', '00:00:00.342234', - '00:00:00.004234'], - dtype='timedelta64[ns]') - >>> cudf.TimedeltaIndex([1, 2, 3, 4], dtype='timedelta64[s]', + ... dtype="timedelta64[ns]") + TimedeltaIndex(['0 days 00:00:00.001132223', '0 days 00:00:00.002023232', + '0 days 00:00:00.342234324', '0 days 00:00:00.004234324'], + dtype='timedelta64[ns]') + >>> cudf.TimedeltaIndex([1, 2, 3, 4], dtype="timedelta64[s]", ... name="delta-index") - TimedeltaIndex(['00:00:01', '00:00:02', '00:00:03', '00:00:04'], - dtype='timedelta64[s]', name='delta-index') + TimedeltaIndex(['0 days 00:00:01', '0 days 00:00:02', '0 days 00:00:03', + '0 days 00:00:04'], + dtype='timedelta64[s]', name='delta-index') """ def __init__( @@ -2154,11 +2152,11 @@ class CategoricalIndex(GenericIndex): >>> import pandas as pd >>> cudf.CategoricalIndex( ... data=[1, 2, 3, 4], categories=[1, 2], ordered=False, name="a") - CategoricalIndex([1, 2, , ], categories=[1, 2], ordered=False, name='a', dtype='category', name='a') + CategoricalIndex([1, 2, , ], categories=[1, 2], ordered=False, dtype='category', name='a') >>> cudf.CategoricalIndex( ... data=[1, 2, 3, 4], dtype=pd.CategoricalDtype([1, 2, 3]), name="a") - CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, name='a', dtype='category', name='a') + CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, dtype='category', name='a') """ # noqa: E501 def __init__( @@ -2449,9 +2447,7 @@ def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): >>> import cudf >>> import pandas as pd >>> cudf.IntervalIndex.from_breaks([0, 1, 2, 3]) - IntervalIndex([(0, 1], (1, 2], (2, 3]], - closed='right', - dtype='interval[int64]') + IntervalIndex([(0, 1], (1, 2], (2, 3]], dtype='interval') """ if copy: breaks = column.as_column(breaks, dtype=dtype).copy() diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index b333c862f21..3acc947c649 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -190,7 +190,7 @@ def rename(self, names, inplace=False): Renaming each levels of a MultiIndex to specified name: >>> midx = cudf.MultiIndex.from_product( - [('A', 'B'), (2020, 2021)], names=['c1', 'c2']) + ... [('A', 'B'), (2020, 2021)], names=['c1', 'c2']) >>> midx.rename(['lv1', 'lv2']) MultiIndex([('A', 2020), ('A', 2021), @@ -1086,7 +1086,7 @@ def values(self): [4, 2], [5, 1]]) >>> type(midx.values) - + """ return self.to_frame(index=False).values @@ -1587,13 +1587,13 @@ def get_loc(self, key, method=None, tolerance=None): -------- >>> import cudf >>> mi = cudf.MultiIndex.from_tuples( - [('a', 'd'), ('b', 'e'), ('b', 'f')]) + ... [('a', 'd'), ('b', 'e'), ('b', 'f')]) >>> mi.get_loc('b') slice(1, 3, None) >>> mi.get_loc(('b', 'e')) 1 >>> non_monotonic_non_unique_idx = cudf.MultiIndex.from_tuples( - [('c', 'd'), ('b', 'e'), ('a', 'f'), ('b', 'e')]) + ... [('c', 'd'), ('b', 'e'), ('a', 'f'), ('b', 'e')]) >>> non_monotonic_non_unique_idx.get_loc('b') # differ from pandas slice(1, 4, 2) @@ -1609,10 +1609,10 @@ def get_loc(self, key, method=None, tolerance=None): >>> import pandas as pd >>> import cudf - >>> x = pd.MultiIndex.from_tuples( - [(2, 1, 1), (1, 2, 3), (1, 2, 1), - (1, 1, 1), (1, 1, 1), (2, 2, 1)] - ) + >>> x = pd.MultiIndex.from_tuples([ + ... (2, 1, 1), (1, 2, 3), (1, 2, 1), + ... (1, 1, 1), (1, 1, 1), (2, 2, 1), + ... ]) >>> x.get_loc(1) array([False, True, True, True, True, False]) >>> cudf.from_pandas(x).get_loc(1) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 1733a6c0b9a..68113cfdca9 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -891,7 +891,7 @@ def pivot(data, index=None, columns=None, values=None): Examples -------- >>> a = cudf.DataFrame() - >>> a['a'] = [1, 1, 2, 2], + >>> a['a'] = [1, 1, 2, 2] >>> a['b'] = ['a', 'b', 'a', 'b'] >>> a['c'] = [1, 2, 3, 4] >>> a.pivot(index='a', columns='b') @@ -973,6 +973,7 @@ def unstack(df, level, fill_value=None): Examples -------- + >>> df = cudf.DataFrame() >>> df['a'] = [1, 1, 1, 2, 2] >>> df['b'] = [1, 2, 3, 1, 2] >>> df['c'] = [5, 6, 7, 8, 9] diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 787b28e213c..37bb8e32c5a 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -32,7 +32,7 @@ class Scalar(object): >>> cudf.Scalar(42, dtype='int64') + np.int8(21) Scalar(63, dtype=int64) >>> x = cudf.Scalar(42, dtype='datetime64[s]') - >>> y = cudf.Scalar(21, dtype='timedelta64[ns]) + >>> y = cudf.Scalar(21, dtype='timedelta64[ns]') >>> x - y Scalar(1970-01-01T00:00:41.999999979, dtype=datetime64[ns]) >>> cudf.Series([1,2,3]) + cudf.Scalar(1) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 11166320760..6842a05a505 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -512,13 +512,26 @@ def from_pandas(cls, s, nan_as_null=None): @property def dt(self): """ - Accessor object for datetimelike properties of the Series values. + Accessor object for datetime-like properties of the Series values. Examples -------- + >>> s = cudf.Series(cudf.date_range( + ... start='2001-02-03 12:00:00', + ... end='2001-02-03 14:00:00', + ... freq='1H')) >>> s.dt.hour + 0 12 + 1 13 + dtype: int16 >>> s.dt.second + 0 0 + 1 0 + dtype: int16 >>> s.dt.day + 0 3 + 1 3 + dtype: int16 Returns ------- @@ -674,10 +687,12 @@ def drop( y 3 2 x 4 y 5 + dtype: int64 >>> s.drop(labels='y', level=1) 0 x 0 1 x 2 2 x 4 + Name: 2, dtype: int64 """ if labels is not None: if index is not None or columns is not None: @@ -1032,7 +1047,7 @@ def memory_usage(self, index=True, deep=False): -------- >>> s = cudf.Series(range(3), index=['a','b','c']) >>> s.memory_usage() - 48 + 43 Not including the index gives the size of the rest of the data, which is necessarily smaller: @@ -1448,10 +1463,11 @@ def _concat(cls, objs, axis=0, index=True): col = concat_columns([o._column for o in objs]) - if isinstance(col, cudf.core.column.Decimal64Column): - col = col._with_type_metadata(objs[0]._column.dtype) - - if isinstance(col, cudf.core.column.StructColumn): + # Reassign precision for decimal cols & type schema for struct cols + if isinstance( + col, + (cudf.core.column.Decimal64Column, cudf.core.column.StructColumn), + ): col = col._with_type_metadata(objs[0].dtype) return cls(data=col, index=index, name=name) @@ -1538,7 +1554,7 @@ def dropna(self, axis=0, inplace=False, how=None): >>> ser 0 1 1 2 - 2 null + 2 dtype: int64 Drop null values from a Series. @@ -1799,7 +1815,7 @@ def data(self): 3 4 dtype: int64 >>> series.data - + >>> series.data.to_host_array() array([1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0], dtype=uint8) @@ -1823,14 +1839,7 @@ def as_mask(self): >>> import cudf >>> s = cudf.Series([True, False, True]) >>> s.as_mask() - - >>> s.as_mask().to_host_array() - array([ 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, - 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 181, 164, - 188, 1, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, - 127, 253, 214, 62, 241, 1, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - dtype=uint8) + """ if not is_bool_dtype(self.dtype): raise TypeError( @@ -2804,11 +2813,11 @@ def autocorr(self, lag=1): Examples -------- >>> import cudf - >>> s = cudf.Series([0.25, 0.5, 0.2, -0.05]) + >>> s = cudf.Series([0.25, 0.5, 0.2, -0.05, 0.17]) >>> s.autocorr() - 0.10355263309024071 + 0.1438853844... >>> s.autocorr(lag=2) - -0.9999999999999999 + -0.9647548490... """ return self.corr(self.shift(lag)) @@ -3583,7 +3592,7 @@ def keys(self): dtype: int64 >>> sr.keys() - RangeIndex(start=0, stop=6) + RangeIndex(start=0, stop=6, step=1) >>> sr = cudf.Series(['a', 'b', 'c']) >>> sr 0 a @@ -3591,7 +3600,7 @@ def keys(self): 2 c dtype: object >>> sr.keys() - RangeIndex(start=0, stop=3) + RangeIndex(start=0, stop=3, step=1) >>> sr = cudf.Series([1, 2, 3], index=['a', 'b', 'c']) >>> sr a 1 diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 15426d0173a..62c31691ac1 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -397,10 +397,10 @@ class DateOffset: -------- >>> from cudf import DateOffset >>> ts = cudf.Series([ - "2000-01-01 00:00:00.012345678", - "2000-01-31 00:00:00.012345678", - "2000-02-29 00:00:00.012345678", - ], dtype='datetime64[ns]) + ... "2000-01-01 00:00:00.012345678", + ... "2000-01-31 00:00:00.012345678", + ... "2000-02-29 00:00:00.012345678", + ... ], dtype='datetime64[ns]') >>> ts + DateOffset(months=3) 0 2000-04-01 00:00:00.012345678 1 2000-04-30 00:00:00.012345678 diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index e5b298a8448..6171f20929d 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -746,18 +746,31 @@ def test_index_astype(nelem): np.testing.assert_equal(df.index.to_numpy(), df["a"].to_numpy()) -def test_dataframe_to_string(): - pd.options.display.max_rows = 5 - pd.options.display.max_columns = 8 - # Test basic +def test_dataframe_to_string_with_skipped_rows(): + # Test skipped rows df = cudf.DataFrame( {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} ) - string = str(df) - assert string.splitlines()[-1] == "[6 rows x 2 columns]" + with pd.option_context("display.max_rows", 5): + got = df.to_string() + + expect = textwrap.dedent( + """\ + a b + 0 1 11 + 1 2 12 + .. .. .. + 4 5 15 + 5 6 16 + + [6 rows x 2 columns]""" + ) + assert got == expect + - # Test skipped columns +def test_dataframe_to_string_with_skipped_rows_and_columns(): + # Test skipped rows and skipped columns df = cudf.DataFrame( { "a": [1, 2, 3, 4, 5, 6], @@ -766,11 +779,26 @@ def test_dataframe_to_string(): "d": [11, 12, 13, 14, 15, 16], } ) - string = df.to_string() - assert string.splitlines()[-1] == "[6 rows x 4 columns]" + with pd.option_context("display.max_rows", 5, "display.max_columns", 3): + got = df.to_string() - # Test masked + expect = textwrap.dedent( + """\ + a ... d + 0 1 ... 11 + 1 2 ... 12 + .. .. ... .. + 4 5 ... 15 + 5 6 ... 16 + + [6 rows x 4 columns]""" + ) + assert got == expect + + +def test_dataframe_to_string_with_masked_data(): + # Test masked data df = cudf.DataFrame( {"a": [1, 2, 3, 4, 5, 6], "b": [11, 12, 13, 14, 15, 16]} ) @@ -783,34 +811,33 @@ def test_dataframe_to_string(): assert masked.null_count == 2 df["c"] = masked - # check data + # Check data values = masked.copy() validids = [0, 2, 3, 5] densearray = masked.dropna().to_numpy() np.testing.assert_equal(data[validids], densearray) - # valid position is correct - + # Valid position is correct for i in validids: assert data[i] == values[i] - # null position is correct + # Null position is correct for i in range(len(values)): if i not in validids: assert values[i] is cudf.NA - pd.options.display.max_rows = 10 - got = df.to_string() + with pd.option_context("display.max_rows", 10): + got = df.to_string() - expect = """ -a b c -0 1 11 0 -1 2 12 -2 3 13 2 -3 4 14 3 -4 5 15 -5 6 16 5 -""" - # values should match despite whitespace difference - assert got.split() == expect.split() + expect = textwrap.dedent( + """\ + a b c + 0 1 11 0 + 1 2 12 + 2 3 13 2 + 3 4 14 3 + 4 5 15 + 5 6 16 5""" + ) + assert got == expect def test_dataframe_to_string_wide(monkeypatch): @@ -9030,3 +9057,14 @@ def test_dataframe_add_suffix(): expected = pdf.add_suffix("_item") assert_eq(got, expected) + + +def test_dataframe_assign_cp_np_array(): + m, n = 5, 3 + cp_ndarray = cupy.random.randn(m, n) + pdf = pd.DataFrame({f"f_{i}": range(m) for i in range(n)}) + gdf = cudf.DataFrame({f"f_{i}": range(m) for i in range(n)}) + pdf[[f"f_{i}" for i in range(n)]] = cupy.asnumpy(cp_ndarray) + gdf[[f"f_{i}" for i in range(n)]] = cp_ndarray + + assert_eq(pdf, gdf) diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py new file mode 100644 index 00000000000..05d6886c297 --- /dev/null +++ b/python/cudf/cudf/tests/test_doctests.py @@ -0,0 +1,102 @@ +import contextlib +import doctest +import inspect +import io +import os + +import numpy as np +import pytest + +import cudf + + +def _name_in_all(parent, name): + return name in getattr(parent, "__all__", []) + + +def _is_public_name(parent, name): + return not name.startswith("_") + + +def _find_doctests_in_obj(obj, finder=None, criteria=None): + """Find all doctests in an object. + + Parameters + ---------- + obj : module or class + The object to search for docstring examples. + finder : doctest.DocTestFinder, optional + The DocTestFinder object to use. If not provided, a DocTestFinder is + constructed. + criteria : callable, optional + Callable indicating whether to recurse over members of the provided + object. If not provided, names not defined in the object's ``__all__`` + property are ignored. + + Yields + ------ + doctest.DocTest + The next doctest found in the object. + """ + if finder is None: + finder = doctest.DocTestFinder() + if criteria is None: + criteria = _name_in_all + for docstring in finder.find(obj): + if docstring.examples: + yield docstring + for name, member in inspect.getmembers(obj): + # Only recurse over members matching the criteria + if not criteria(obj, name): + continue + # Recurse over the public API of modules (objects defined in the + # module's __all__) + if inspect.ismodule(member): + yield from _find_doctests_in_obj( + member, finder, criteria=_name_in_all + ) + # Recurse over the public API of classes (attributes not prefixed with + # an underscore) + if inspect.isclass(member): + yield from _find_doctests_in_obj( + member, finder, criteria=_is_public_name + ) + + +class TestDoctests: + @pytest.fixture(autouse=True) + def chdir_to_tmp_path(cls, tmp_path): + # Some doctests generate files, so this fixture runs the tests in a + # temporary directory. + original_directory = os.getcwd() + os.chdir(tmp_path) + yield + os.chdir(original_directory) + + @pytest.mark.parametrize( + "docstring", + _find_doctests_in_obj(cudf), + ids=lambda docstring: docstring.name, + ) + def test_docstring(self, docstring): + # We ignore differences in whitespace in the doctest output, and enable + # the use of an ellipsis "..." to match any string in the doctest + # output. An ellipsis is useful for, e.g., memory addresses or + # imprecise floating point values. + optionflags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE + runner = doctest.DocTestRunner(optionflags=optionflags) + + # These global names are pre-defined and can be used in doctests + # without first importing them. + globals = dict(cudf=cudf, np=np,) + docstring.globs = globals + + # Capture stdout and include failing outputs in the traceback. + doctest_stdout = io.StringIO() + with contextlib.redirect_stdout(doctest_stdout): + runner.run(docstring) + results = runner.summarize() + assert not results.failed, ( + f"{results.failed} of {results.attempted} doctests failed for " + f"{docstring.name}:\n{doctest_stdout.getvalue()}" + ) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index f8c136b8c2d..82020f30f7c 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1475,3 +1475,33 @@ def test_empty_series_name(): gs = cudf.from_pandas(ps) assert ps.__repr__() == gs.__repr__() + + +def test_repr_struct_after_concat(): + df = cudf.DataFrame( + { + "a": cudf.Series( + [ + {"sa": 2056831253}, + {"sa": -1463792165}, + {"sa": 1735783038}, + {"sa": 103774433}, + {"sa": -1413247520}, + ] + * 13 + ), + "b": cudf.Series( + [ + {"sa": {"ssa": 1140062029}}, + None, + {"sa": {"ssa": 1998862860}}, + {"sa": None}, + {"sa": {"ssa": -395088502}}, + ] + * 13 + ), + } + ) + pdf = df.to_pandas() + + assert df.__repr__() == pdf.__repr__() diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py index 7a4a2673f9b..2fcf996b641 100644 --- a/python/cudf/cudf/utils/docutils.py +++ b/python/cudf/cudf/utils/docutils.py @@ -225,13 +225,13 @@ def wrapper(func): 2 2010-01-01 dtype: datetime64[s] >>> s.describe() - count 3 - mean 2006-09-01 08:00:00.000000000 - min 2000-01-01 00:00:00.000000000 - 25% 2004-12-31 12:00:00.000000000 - 50% 2010-01-01 00:00:00.000000000 - 75% 2010-01-01 00:00:00.000000000 - max 2010-01-01 00:00:00.000000000 + count 3 + mean 2006-09-01 08:00:00 + min 2000-01-01 00:00:00 + 25% 2004-12-31 12:00:00 + 50% 2010-01-01 00:00:00 + 75% 2010-01-01 00:00:00 + max 2010-01-01 00:00:00 dtype: object Describing a ``DataFrame``. By default only numeric fields are diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index b881f9372bc..6f958860dad 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -972,9 +972,9 @@ >>> import cudf >>> filename = 'foo.csv' >>> df = cudf.DataFrame({'x': [0, 1, 2, 3], - 'y': [1.0, 3.3, 2.2, 4.4], - 'z': ['a', 'b', 'c', 'd']}) ->>> df = df.set_index([3, 2, 1, 0]) +... 'y': [1.0, 3.3, 2.2, 4.4], +... 'z': ['a', 'b', 'c', 'd']}) +>>> df = df.set_index(cudf.Series([3, 2, 1, 0])) >>> df.to_csv(filename) """