From 027c34aefbf8a5abf5394da15a7b6f1dcc63b06c Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 29 Apr 2022 19:30:33 -0500 Subject: [PATCH] Use generator expressions in any/all functions. (#10736) This PR uses generator expressions in `any(...)` and `all(...)` to avoid allocating a list in memory while maximizing the potential benefit of early exit from the `any`/`all` function. I also fixed a few miscellaneous things (~ 10 lines): - Use `cls` in `classmethod`s - Simplify a lambda expression - Use `super()` with no arguments if the arguments are the parent class and `self` - Parenthesize multi-line strings with implicit concatenation to clarify the behavior when written in a tuple of values Note: Some of these were caught by https://codereview.doctor/rapidsai/cudf. In some places, the bot correctly identified a problem but its suggestions were invalid or incomplete. I identified steps for improvement beyond what the bot suggested for most of these cases. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Ashwin Srinath (https://github.com/shwina) - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/10736 --- python/cudf/cudf/_fuzz_testing/utils.py | 12 ++++-------- python/cudf/cudf/core/column/interval.py | 4 ++-- python/cudf/cudf/core/column/struct.py | 2 +- python/cudf/cudf/core/dataframe.py | 14 ++++++-------- python/cudf/cudf/core/tools/datetimes.py | 2 +- python/cudf/cudf/io/parquet.py | 2 +- python/cudf/cudf/testing/_utils.py | 2 +- python/cudf/cudf/tests/test_custom_accessor.py | 4 ++-- python/cudf/cudf/tests/test_dtypes.py | 6 ++---- python/cudf/cudf/tests/test_multiindex.py | 8 ++++---- python/cudf/cudf/tests/test_text.py | 12 ++++++++---- 11 files changed, 32 insertions(+), 36 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py index 87a8fc46374..9f3c0ab6d5f 100644 --- a/python/cudf/cudf/_fuzz_testing/utils.py +++ b/python/cudf/cudf/_fuzz_testing/utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import random from collections import OrderedDict @@ -312,13 +312,9 @@ def sanitize(value, struct_type): return tuple(values_list) has_nulls_or_nullable_dtype = any( - [ - True - if df[col].dtype in pandas_dtypes_to_np_dtypes - or df[col].isnull().any() - else False - for col in df.columns - ] + (col := df[colname]).dtype in pandas_dtypes_to_np_dtypes + or col.isnull().any() + for colname in df.columns ) pdf = df.copy(deep=True) for field in arrow_table_schema: diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py index a873a0f98a5..bfaf65d45e2 100644 --- a/python/cudf/cudf/core/column/interval.py +++ b/python/cudf/cudf/core/column/interval.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. import pandas as pd import pyarrow as pa @@ -39,7 +39,7 @@ def closed(self): return self._closed @classmethod - def from_arrow(self, data): + def from_arrow(cls, data): new_col = super().from_arrow(data.storage) size = len(data) dtype = IntervalDtype.from_arrow(data.type) diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 53e6e9972b1..ed5e1c9450d 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -31,7 +31,7 @@ def base_size(self): return len(self.base_children[0]) @classmethod - def from_arrow(self, data): + def from_arrow(cls, data): size = len(data) dtype = cudf.core.dtypes.StructDtype.from_arrow(data.type) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0d3b3ee0300..8c459e855c1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -160,7 +160,7 @@ def _can_downcast_to_series(self, df, arg): ): return True dtypes = df.dtypes.values.tolist() - all_numeric = all([is_numeric_dtype(t) for t in dtypes]) + all_numeric = all(is_numeric_dtype(t) for t in dtypes) if all_numeric: return True if ncols == 1: @@ -720,7 +720,7 @@ def _init_from_series_list(self, data, columns, index): final_index = as_index(index) - series_lengths = list(map(lambda x: len(x), data)) + series_lengths = list(map(len, data)) data = numeric_normalize_types(*data) if series_lengths.count(series_lengths[0]) == len(series_lengths): # Calculating the final dataframe columns by @@ -2999,11 +2999,11 @@ def agg(self, aggs, axis=None): elif isinstance(aggs, dict): cols = aggs.keys() - if any([callable(val) for val in aggs.values()]): + if any(callable(val) for val in aggs.values()): raise NotImplementedError( "callable parameter is not implemented yet" ) - elif all([isinstance(val, str) for val in aggs.values()]): + elif all(isinstance(val, str) for val in aggs.values()): result = cudf.Series(index=cols) for key, value in aggs.items(): col = df_normalized[key] @@ -3013,7 +3013,7 @@ def agg(self, aggs, axis=None): f"'Series' object" ) result[key] = getattr(col, value)() - elif all([isinstance(val, abc.Iterable) for val in aggs.values()]): + elif all(isinstance(val, abc.Iterable) for val in aggs.values()): idxs = set() for val in aggs.values(): if isinstance(val, str): @@ -6032,9 +6032,7 @@ def append( if (cols.get_indexer(other._data.to_pandas_index()) >= 0).all(): other = other.reindex(columns=cols) - return super(DataFrame, self)._append( - other, ignore_index, verify_integrity, sort - ) + return super()._append(other, ignore_index, verify_integrity, sort) @_cudf_nvtx_annotate @copy_docstring(reshape.pivot) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 3ce89bc27e8..ccd23b82c88 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -629,7 +629,7 @@ def _generate_months_column(self, size, op): def _is_no_op(self) -> bool: # some logic could be implemented here for more complex cases # such as +1 year, -12 months - return all([i == 0 for i in self._kwds.values()]) + return all(i == 0 for i in self._kwds.values()) def __neg__(self): new_scalars = {k: -v for k, v in self._kwds.items()} diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index baedc3f174b..5746bf6fec9 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -56,7 +56,7 @@ def _write_parquet( "row_group_size_rows": row_group_size_rows, "partitions_info": partitions_info, } - if all([ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs]): + if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs): with ExitStack() as stack: fsspec_objs = [stack.enter_context(file) for file in paths_or_bufs] file_objs = [ diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 607d9121630..5232d1adb64 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -133,7 +133,7 @@ def assert_eq(left, right, **kwargs): # Use the overloaded __eq__ of the operands if left == right: return True - elif any([np.issubdtype(type(x), np.floating) for x in (left, right)]): + elif any(np.issubdtype(type(x), np.floating) for x in (left, right)): np.testing.assert_almost_equal(left, right) else: np.testing.assert_equal(left, right) diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/test_custom_accessor.py index bfd2ccbccef..35cc107b257 100644 --- a/python/cudf/cudf/tests/test_custom_accessor.py +++ b/python/cudf/cudf/tests/test_custom_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. import pandas as pd import pytest @@ -17,7 +17,7 @@ def __init__(self, obj): @staticmethod def _validate(obj): cols = obj.columns - if not all([vertex in cols for vertex in ["x", "y"]]): + if not all(vertex in cols for vertex in ["x", "y"]): raise AttributeError("Must have vertices 'x', 'y'.") @property diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py index 356685c976e..f6a0e41a0c7 100644 --- a/python/cudf/cudf/tests/test_dtypes.py +++ b/python/cudf/cudf/tests/test_dtypes.py @@ -189,10 +189,8 @@ def assert_column_array_dtype_equal(column: ColumnBase, array: pa.array): ) elif isinstance(column.dtype, StructDtype): return array.type.equals(column.dtype.to_arrow()) and all( - [ - assert_column_array_dtype_equal(child, array.field(i)) - for i, child in enumerate(column.base_children) - ] + assert_column_array_dtype_equal(child, array.field(i)) + for i, child in enumerate(column.base_children) ) elif isinstance( column.dtype, (Decimal128Dtype, Decimal64Dtype, Decimal32Dtype) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index 38225b3efb9..f3830ed386a 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -762,7 +762,7 @@ def test_multiindex_copy_deep(data, deep): lptrs = [child.base_data.ptr for child in lchildren] rptrs = [child.base_data.ptr for child in rchildren] - assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)]) + assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) elif isinstance(data, cudf.MultiIndex): mi1 = data @@ -772,19 +772,19 @@ def test_multiindex_copy_deep(data, deep): lptrs = [lv._data._data[None].base_data.ptr for lv in mi1._levels] rptrs = [lv._data._data[None].base_data.ptr for lv in mi2._levels] - assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)]) + assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._codes identity lptrs = [c.base_data.ptr for _, c in mi1._codes._data.items()] rptrs = [c.base_data.ptr for _, c in mi2._codes._data.items()] - assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)]) + assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) # Assert ._data identity lptrs = [d.base_data.ptr for _, d in mi1._data.items()] rptrs = [d.base_data.ptr for _, d in mi2._data.items()] - assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)]) + assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs)) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 21c22110910..a4edaeff545 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -308,8 +308,10 @@ def test_character_tokenize_series(): "hello world", "sdf", None, - "goodbye, one-two:three~four+five_six@sev" - "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ", + ( + "goodbye, one-two:three~four+five_six@sev" + "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" + ), ] ) expected = cudf.Series( @@ -423,8 +425,10 @@ def test_character_tokenize_index(): "hello world", "sdf", None, - "goodbye, one-two:three~four+five_six@sev" - "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ", + ( + "goodbye, one-two:three~four+five_six@sev" + "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" + ), ] ) expected = cudf.core.index.as_index(