Skip to content

Commit

Permalink
Standardize usage of collections.abc. (#10679)
Browse files Browse the repository at this point in the history
The codebase currently uses multiple ways of importing `collections.abc` classes in cudf. This can be problematic because the names in `collections.abc` can overlap with names in `typing`, so we need a way to disambiguate the two. This PR standardizes our imports such that abstract base classes follow a pattern so that `abc.` is always in the name of abstract base classes.

```python
from collections import abc
# Not "import collections.abc" or "from collections.abc import Mapping"

if isinstance(obj, abc.Mapping):
    pass
```

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #10679
  • Loading branch information
bdice authored Apr 18, 2022
1 parent c322cba commit 6c79b59
Show file tree
Hide file tree
Showing 16 changed files with 60 additions and 57 deletions.
2 changes: 1 addition & 1 deletion python/cudf/cudf/_fuzz_testing/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import logging
import random
from collections import abc as abc
from collections import abc

import numpy as np

Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.map cimport map
Expand All @@ -19,9 +19,9 @@ import cudf

from cudf._lib.cpp.types cimport size_type

import collections.abc as abc
import errno
import os
from collections import abc
from enum import IntEnum
from io import BytesIO, StringIO

Expand Down Expand Up @@ -238,7 +238,7 @@ cdef csv_reader_options make_csv_reader_options(
"`parse_dates`: dictionaries are unsupported")
if not isinstance(parse_dates, abc.Iterable):
raise NotImplementedError(
"`parse_dates`: non-lists are unsupported")
"`parse_dates`: an iterable is required")
for col in parse_dates:
if isinstance(col, str):
c_parse_dates_names.push_back(str(col).encode())
Expand Down Expand Up @@ -279,7 +279,7 @@ cdef csv_reader_options make_csv_reader_options(
)
csv_reader_options_c.set_dtypes(c_dtypes_list)
csv_reader_options_c.set_parse_hex(c_hex_col_indexes)
elif isinstance(dtype, abc.Iterable):
elif isinstance(dtype, abc.Collection):
c_dtypes_list.reserve(len(dtype))
for index, col_dtype in enumerate(dtype):
if col_dtype in CSV_HEX_TYPE_MAP:
Expand Down
10 changes: 5 additions & 5 deletions python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# Copyright (c) 2019-2020, NVIDIA CORPORATION.
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

# cython: boundscheck = False


import collections.abc as abc
import io
import os
from collections import abc

import cudf

Expand Down Expand Up @@ -82,15 +82,15 @@ cpdef read_json(object filepaths_or_buffers,
for k, v in dtype.items():
c_dtypes_map[str(k).encode()] = \
_get_cudf_data_type_from_dtype(v)
elif not isinstance(dtype, abc.Iterable):
raise TypeError("`dtype` must be 'list like' or 'dict'")
else:
elif isinstance(dtype, abc.Collection):
is_list_like_dtypes = True
c_dtypes_list.reserve(len(dtype))
for col_dtype in dtype:
c_dtypes_list.push_back(
_get_cudf_data_type_from_dtype(
col_dtype))
else:
raise TypeError("`dtype` must be 'list like' or 'dict'")

cdef json_reader_options opts = move(
json_reader_options.builder(make_source_info(filepaths_or_buffers))
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from __future__ import annotations

from collections.abc import Sequence
from collections import abc
from functools import wraps
from inspect import isclass
from typing import List, Union
Expand Down Expand Up @@ -174,7 +174,7 @@ def is_list_like(obj):
bool
Return True if given object is list-like.
"""
return isinstance(obj, (Sequence, np.ndarray)) and not isinstance(
return isinstance(obj, (abc.Sequence, np.ndarray)) and not isinstance(
obj, (str, bytes)
)

Expand Down
5 changes: 2 additions & 3 deletions python/cudf/cudf/comm/gpuarrow.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
from collections import OrderedDict
from collections.abc import Sequence
from collections import OrderedDict, abc

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -32,7 +31,7 @@ def __init__(self, source, schema=None):
self._open(source, schema)


class GpuArrowReader(Sequence):
class GpuArrowReader(abc.Sequence):
def __init__(self, schema, dev_ary):
self._table = CudaRecordBatchStreamReader(dev_ary, schema).read_all()

Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from __future__ import annotations

import pickle
from collections.abc import MutableSequence
from collections import abc
from functools import cached_property
from typing import (
TYPE_CHECKING,
Expand Down Expand Up @@ -1379,7 +1379,9 @@ def view(self, dtype: Dtype) -> ColumnBase:
)

@staticmethod
def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn:
def _concat(
objs: abc.MutableSequence[CategoricalColumn],
) -> CategoricalColumn:
# TODO: This function currently assumes it is being called from
# column.concat_columns, at least to the extent that all the
# preprocessing in that function has already been done. That should be
Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/column_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from __future__ import annotations

import itertools
from collections.abc import MutableMapping
from collections import abc
from functools import cached_property, reduce
from typing import (
TYPE_CHECKING,
Expand Down Expand Up @@ -78,7 +78,7 @@ def _to_flat_dict(d):
return {k: v for k, v in _to_flat_dict_inner(d)}


class ColumnAccessor(MutableMapping):
class ColumnAccessor(abc.MutableMapping):
"""
Parameters
----------
Expand All @@ -99,7 +99,7 @@ class ColumnAccessor(MutableMapping):

def __init__(
self,
data: Union[MutableMapping, ColumnAccessor] = None,
data: Union[abc.MutableMapping, ColumnAccessor] = None,
multiindex: bool = False,
level_names=None,
):
Expand Down Expand Up @@ -213,7 +213,7 @@ def columns(self) -> Tuple[ColumnBase, ...]:
return tuple(self.values())

@cached_property
def _grouped_data(self) -> MutableMapping:
def _grouped_data(self) -> abc.MutableMapping:
"""
If self.multiindex is True,
return the underlying mapping as a nested mapping.
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/cut.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

from collections.abc import Sequence
from collections import abc

import cupy
import numpy as np
Expand Down Expand Up @@ -140,7 +140,7 @@ def cut(
)

# bins can either be an int, sequence of scalars or an intervalIndex
if isinstance(bins, Sequence):
if isinstance(bins, abc.Sequence):
if len(set(bins)) is not len(bins):
if duplicates == "raise":
raise ValueError(
Expand All @@ -158,7 +158,7 @@ def cut(

# create bins if given an int or single scalar
if not isinstance(bins, pd.IntervalIndex):
if not isinstance(bins, (Sequence)):
if not isinstance(bins, (abc.Sequence)):
if isinstance(
x, (pd.Series, cudf.Series, np.ndarray, cupy.ndarray)
):
Expand Down
25 changes: 13 additions & 12 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
import pickle
import sys
import warnings
from collections import defaultdict
from collections.abc import Iterable, Mapping, Sequence
from collections import abc, defaultdict
from typing import (
Any,
Callable,
Expand Down Expand Up @@ -1857,7 +1856,7 @@ def _make_operands_and_index_for_binop(
Optional[BaseIndex],
]:
# Check built-in types first for speed.
if isinstance(other, (list, dict, Sequence, Mapping)):
if isinstance(other, (list, dict, abc.Sequence, abc.Mapping)):
warnings.warn(
"Binary operations between host objects such as "
f"{type(other)} and cudf.DataFrame are deprecated and will be "
Expand All @@ -1878,7 +1877,7 @@ def _make_operands_and_index_for_binop(

if _is_scalar_or_zero_d_array(other):
rhs = {name: other for name in self._data}
elif isinstance(other, (list, Sequence)):
elif isinstance(other, (list, abc.Sequence)):
rhs = {name: o for (name, o) in zip(self._data, other)}
elif isinstance(other, Series):
rhs = dict(zip(other.index.values_host, other.values_host))
Expand Down Expand Up @@ -1907,7 +1906,7 @@ def _make_operands_and_index_for_binop(
# the fill value.
left_default = fill_value

if not isinstance(rhs, (dict, Mapping)):
if not isinstance(rhs, (dict, abc.Mapping)):
return NotImplemented, None

operands = {
Expand Down Expand Up @@ -2961,7 +2960,9 @@ def agg(self, aggs, axis=None):
if axis == 0 or axis is not None:
raise NotImplementedError("axis not implemented yet")

if isinstance(aggs, Iterable) and not isinstance(aggs, (str, dict)):
if isinstance(aggs, abc.Iterable) and not isinstance(
aggs, (str, dict)
):
result = DataFrame()
# TODO : Allow simultaneous pass for multi-aggregation as
# a future optimization
Expand Down Expand Up @@ -2997,13 +2998,13 @@ def agg(self, aggs, axis=None):
f"'Series' object"
)
result[key] = getattr(col, value)()
elif all([isinstance(val, Iterable) for val in aggs.values()]):
elif all([isinstance(val, abc.Iterable) for val in aggs.values()]):
idxs = set()
for val in aggs.values():
if isinstance(val, Iterable):
idxs.update(val)
elif isinstance(val, str):
if isinstance(val, str):
idxs.add(val)
elif isinstance(val, abc.Iterable):
idxs.update(val)
idxs = sorted(list(idxs))
for agg in idxs:
if agg is callable:
Expand All @@ -3017,7 +3018,7 @@ def agg(self, aggs, axis=None):
len(idxs), dtype=col.dtype, masked=True
)
ans = cudf.Series(data=col_empty, index=idxs)
if isinstance(aggs.get(key), Iterable):
if isinstance(aggs.get(key), abc.Iterable):
# TODO : Allow simultaneous pass for multi-aggregation
# as a future optimization
for agg in aggs.get(key):
Expand Down Expand Up @@ -6157,7 +6158,7 @@ def _sample_axis_1(
def _from_columns_like_self(
self,
columns: List[ColumnBase],
column_names: Iterable[str],
column_names: abc.Iterable[str],
index_names: Optional[List[str]] = None,
) -> DataFrame:
result = super()._from_columns_like_self(
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/df_protocol.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Copyright (c) 2021-2022, NVIDIA CORPORATION.

import collections
import enum
from collections import abc
from typing import (
Any,
Dict,
Expand Down Expand Up @@ -569,13 +569,13 @@ def get_columns(self) -> Iterable[_CuDFColumn]:
]

def select_columns(self, indices: Sequence[int]) -> "_CuDFDataFrame":
if not isinstance(indices, collections.abc.Sequence):
if not isinstance(indices, abc.Sequence):
raise ValueError("`indices` is not a sequence")

return _CuDFDataFrame(self._df.iloc[:, indices])

def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame":
if not isinstance(names, collections.Sequence):
if not isinstance(names, abc.Sequence):
raise ValueError("`names` is not a sequence")

return _CuDFDataFrame(
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import collections
import itertools
import pickle
import warnings
from collections import abc
from functools import cached_property
from typing import Any, Iterable, List, Tuple, Union

Expand Down Expand Up @@ -1638,7 +1638,7 @@ def _handle_by_or_level(self, by=None, level=None):
self._handle_series(by)
elif isinstance(by, cudf.BaseIndex):
self._handle_index(by)
elif isinstance(by, collections.abc.Mapping):
elif isinstance(by, abc.Mapping):
self._handle_mapping(by)
elif isinstance(by, Grouper):
self._handle_grouper(by)
Expand Down Expand Up @@ -1757,7 +1757,7 @@ def _is_multi_agg(aggs):
Returns True if more than one aggregation is performed
on any of the columns as specified in `aggs`.
"""
if isinstance(aggs, collections.abc.Mapping):
if isinstance(aggs, abc.Mapping):
return any(is_list_like(agg) for agg in aggs.values())
if is_list_like(aggs):
return True
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/join/_join_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from __future__ import annotations

import collections
import warnings
from collections import abc
from typing import TYPE_CHECKING, Any, Tuple, cast

import numpy as np
Expand Down Expand Up @@ -166,7 +166,7 @@ def _match_categorical_dtypes_both(


def _coerce_to_tuple(obj):
if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str):
if isinstance(obj, abc.Iterable) and not isinstance(obj, str):
return tuple(obj)
else:
return (obj,)
6 changes: 3 additions & 3 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import itertools
import numbers
import pickle
from collections.abc import Sequence
from collections import abc
from functools import cached_property
from numbers import Integral
from typing import Any, List, MutableMapping, Tuple, Union
Expand Down Expand Up @@ -95,7 +95,7 @@ def __init__(
if len(levels) == 0:
raise ValueError("Must pass non-zero number of levels/codes")
if not isinstance(codes, cudf.DataFrame) and not isinstance(
codes[0], (Sequence, np.ndarray)
codes[0], (abc.Sequence, np.ndarray)
):
raise TypeError("Codes is not a Sequence of sequences")

Expand Down Expand Up @@ -912,7 +912,7 @@ def deserialize(cls, header, frames):
def __getitem__(self, index):
flatten = isinstance(index, int)

if isinstance(index, (Integral, Sequence)):
if isinstance(index, (Integral, abc.Sequence)):
index = np.array(index)
elif isinstance(index, slice):
start, stop, step = index.indices(len(self))
Expand Down
Loading

0 comments on commit 6c79b59

Please sign in to comment.