Skip to content

Commit

Permalink
Use generator expressions in any/all functions. (#10736)
Browse files Browse the repository at this point in the history
This PR uses generator expressions in `any(...)` and `all(...)` to avoid allocating a list in memory while maximizing the potential benefit of early exit from the `any`/`all` function.

I also fixed a few miscellaneous things (~10 lines):
- Use `cls` in `classmethod`s
- Simplify a lambda expression
- Use `super()` with no arguments if the arguments are the parent class and `self`
- Parenthesize multi-line strings with implicit concatenation to clarify the behavior when written in a tuple of values

Note: Some of these were caught by https://codereview.doctor/rapidsai/cudf. In some places, the bot correctly identified a problem but its suggestions were invalid or incomplete. I identified steps for improvement beyond what the bot suggested for most of these cases.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Michael Wang (https://github.com/isVoid)

URL: #10736
  • Loading branch information
bdice authored Apr 30, 2022
1 parent bf10a94 commit 027c34a
Show file tree
Hide file tree
Showing 11 changed files with 32 additions and 36 deletions.
12 changes: 4 additions & 8 deletions python/cudf/cudf/_fuzz_testing/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import random
from collections import OrderedDict
Expand Down Expand Up @@ -312,13 +312,9 @@ def sanitize(value, struct_type):
return tuple(values_list)

has_nulls_or_nullable_dtype = any(
[
True
if df[col].dtype in pandas_dtypes_to_np_dtypes
or df[col].isnull().any()
else False
for col in df.columns
]
(col := df[colname]).dtype in pandas_dtypes_to_np_dtypes
or col.isnull().any()
for colname in df.columns
)
pdf = df.copy(deep=True)
for field in arrow_table_schema:
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/column/interval.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2021, NVIDIA CORPORATION.
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
import pandas as pd
import pyarrow as pa

Expand Down Expand Up @@ -39,7 +39,7 @@ def closed(self):
return self._closed

@classmethod
def from_arrow(self, data):
def from_arrow(cls, data):
new_col = super().from_arrow(data.storage)
size = len(data)
dtype = IntervalDtype.from_arrow(data.type)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def base_size(self):
return len(self.base_children[0])

@classmethod
def from_arrow(self, data):
def from_arrow(cls, data):
size = len(data)
dtype = cudf.core.dtypes.StructDtype.from_arrow(data.type)

Expand Down
14 changes: 6 additions & 8 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def _can_downcast_to_series(self, df, arg):
):
return True
dtypes = df.dtypes.values.tolist()
all_numeric = all([is_numeric_dtype(t) for t in dtypes])
all_numeric = all(is_numeric_dtype(t) for t in dtypes)
if all_numeric:
return True
if ncols == 1:
Expand Down Expand Up @@ -720,7 +720,7 @@ def _init_from_series_list(self, data, columns, index):

final_index = as_index(index)

series_lengths = list(map(lambda x: len(x), data))
series_lengths = list(map(len, data))
data = numeric_normalize_types(*data)
if series_lengths.count(series_lengths[0]) == len(series_lengths):
# Calculating the final dataframe columns by
Expand Down Expand Up @@ -2999,11 +2999,11 @@ def agg(self, aggs, axis=None):

elif isinstance(aggs, dict):
cols = aggs.keys()
if any([callable(val) for val in aggs.values()]):
if any(callable(val) for val in aggs.values()):
raise NotImplementedError(
"callable parameter is not implemented yet"
)
elif all([isinstance(val, str) for val in aggs.values()]):
elif all(isinstance(val, str) for val in aggs.values()):
result = cudf.Series(index=cols)
for key, value in aggs.items():
col = df_normalized[key]
Expand All @@ -3013,7 +3013,7 @@ def agg(self, aggs, axis=None):
f"'Series' object"
)
result[key] = getattr(col, value)()
elif all([isinstance(val, abc.Iterable) for val in aggs.values()]):
elif all(isinstance(val, abc.Iterable) for val in aggs.values()):
idxs = set()
for val in aggs.values():
if isinstance(val, str):
Expand Down Expand Up @@ -6032,9 +6032,7 @@ def append(
if (cols.get_indexer(other._data.to_pandas_index()) >= 0).all():
other = other.reindex(columns=cols)

return super(DataFrame, self)._append(
other, ignore_index, verify_integrity, sort
)
return super()._append(other, ignore_index, verify_integrity, sort)

@_cudf_nvtx_annotate
@copy_docstring(reshape.pivot)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -629,7 +629,7 @@ def _generate_months_column(self, size, op):
def _is_no_op(self) -> bool:
# some logic could be implemented here for more complex cases
# such as +1 year, -12 months
return all([i == 0 for i in self._kwds.values()])
return all(i == 0 for i in self._kwds.values())

def __neg__(self):
new_scalars = {k: -v for k, v in self._kwds.items()}
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def _write_parquet(
"row_group_size_rows": row_group_size_rows,
"partitions_info": partitions_info,
}
if all([ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs]):
if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs):
with ExitStack() as stack:
fsspec_objs = [stack.enter_context(file) for file in paths_or_bufs]
file_objs = [
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/testing/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def assert_eq(left, right, **kwargs):
# Use the overloaded __eq__ of the operands
if left == right:
return True
elif any([np.issubdtype(type(x), np.floating) for x in (left, right)]):
elif any(np.issubdtype(type(x), np.floating) for x in (left, right)):
np.testing.assert_almost_equal(left, right)
else:
np.testing.assert_equal(left, right)
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_custom_accessor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import pandas as pd
import pytest
Expand All @@ -17,7 +17,7 @@ def __init__(self, obj):
@staticmethod
def _validate(obj):
cols = obj.columns
if not all([vertex in cols for vertex in ["x", "y"]]):
if not all(vertex in cols for vertex in ["x", "y"]):
raise AttributeError("Must have vertices 'x', 'y'.")

@property
Expand Down
6 changes: 2 additions & 4 deletions python/cudf/cudf/tests/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,10 +189,8 @@ def assert_column_array_dtype_equal(column: ColumnBase, array: pa.array):
)
elif isinstance(column.dtype, StructDtype):
return array.type.equals(column.dtype.to_arrow()) and all(
[
assert_column_array_dtype_equal(child, array.field(i))
for i, child in enumerate(column.base_children)
]
assert_column_array_dtype_equal(child, array.field(i))
for i, child in enumerate(column.base_children)
)
elif isinstance(
column.dtype, (Decimal128Dtype, Decimal64Dtype, Decimal32Dtype)
Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/tests/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -762,7 +762,7 @@ def test_multiindex_copy_deep(data, deep):
lptrs = [child.base_data.ptr for child in lchildren]
rptrs = [child.base_data.ptr for child in rchildren]

assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)])
assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs))

elif isinstance(data, cudf.MultiIndex):
mi1 = data
Expand All @@ -772,19 +772,19 @@ def test_multiindex_copy_deep(data, deep):
lptrs = [lv._data._data[None].base_data.ptr for lv in mi1._levels]
rptrs = [lv._data._data[None].base_data.ptr for lv in mi2._levels]

assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)])
assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs))

# Assert ._codes identity
lptrs = [c.base_data.ptr for _, c in mi1._codes._data.items()]
rptrs = [c.base_data.ptr for _, c in mi2._codes._data.items()]

assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)])
assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs))

# Assert ._data identity
lptrs = [d.base_data.ptr for _, d in mi1._data.items()]
rptrs = [d.base_data.ptr for _, d in mi2._data.items()]

assert all([(x == y) is same_ref for x, y in zip(lptrs, rptrs)])
assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs))


@pytest.mark.parametrize(
Expand Down
12 changes: 8 additions & 4 deletions python/cudf/cudf/tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,8 +308,10 @@ def test_character_tokenize_series():
"hello world",
"sdf",
None,
"goodbye, one-two:three~four+five_six@sev"
"en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ",
(
"goodbye, one-two:three~four+five_six@sev"
"en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ"
),
]
)
expected = cudf.Series(
Expand Down Expand Up @@ -423,8 +425,10 @@ def test_character_tokenize_index():
"hello world",
"sdf",
None,
"goodbye, one-two:three~four+five_six@sev"
"en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ",
(
"goodbye, one-two:three~four+five_six@sev"
"en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ"
),
]
)
expected = cudf.core.index.as_index(
Expand Down

0 comments on commit 027c34a

Please sign in to comment.