From 6c79b5902d55bab599731a9bded7e89b9c4875c5 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 18 Apr 2022 18:20:38 -0500 Subject: [PATCH] Standardize usage of collections.abc. (#10679) The codebase currently uses multiple ways of importing `collections.abc` classes in cudf. This can be problematic because the names in `collections.abc` can overlap with names in `typing`, so we need a way to disambiguate the two. This PR standardizes our imports such that abstract base classes follow a pattern so that `abc.` is always in the name of abstract base classes. ```python from collections import abc # Not "import collections.abc" or "from collections.abc import Mapping" if isinstance(obj, abc.Mapping): pass ``` Authors: - Bradley Dice (https://github.com/bdice) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/10679 --- python/cudf/cudf/_fuzz_testing/json.py | 2 +- python/cudf/cudf/_lib/csv.pyx | 8 +++---- python/cudf/cudf/_lib/json.pyx | 10 ++++----- python/cudf/cudf/api/types.py | 4 ++-- python/cudf/cudf/comm/gpuarrow.py | 5 ++--- python/cudf/cudf/core/column/categorical.py | 6 +++-- python/cudf/cudf/core/column_accessor.py | 8 +++---- python/cudf/cudf/core/cut.py | 6 ++--- python/cudf/cudf/core/dataframe.py | 25 +++++++++++---------- python/cudf/cudf/core/df_protocol.py | 6 ++--- python/cudf/cudf/core/groupby/groupby.py | 6 ++--- python/cudf/cudf/core/join/_join_helpers.py | 4 ++-- python/cudf/cudf/core/multiindex.py | 6 ++--- python/cudf/cudf/core/reshape.py | 9 ++++---- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/testing/_utils.py | 10 ++++----- 16 files changed, 60 insertions(+), 57 deletions(-) diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py index f850a7e79f9..29e0aeb7050 100644 --- a/python/cudf/cudf/_fuzz_testing/json.py +++ b/python/cudf/cudf/_fuzz_testing/json.py @@ -2,7 +2,7 @@ import logging import random -from collections import abc as abc +from collections import abc import numpy as np diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 05ff32392fe..f1a75baa951 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.map cimport map @@ -19,9 +19,9 @@ import cudf from cudf._lib.cpp.types cimport size_type -import collections.abc as abc import errno import os +from collections import abc from enum import IntEnum from io import BytesIO, StringIO @@ -238,7 +238,7 @@ cdef csv_reader_options make_csv_reader_options( "`parse_dates`: dictionaries are unsupported") if not isinstance(parse_dates, abc.Iterable): raise NotImplementedError( - "`parse_dates`: non-lists are unsupported") + "`parse_dates`: an iterable is required") for col in parse_dates: if isinstance(col, str): c_parse_dates_names.push_back(str(col).encode()) @@ -279,7 +279,7 @@ cdef csv_reader_options make_csv_reader_options( ) csv_reader_options_c.set_dtypes(c_dtypes_list) csv_reader_options_c.set_parse_hex(c_hex_col_indexes) - elif isinstance(dtype, abc.Iterable): + elif isinstance(dtype, abc.Collection): c_dtypes_list.reserve(len(dtype)) for index, col_dtype in enumerate(dtype): if col_dtype in CSV_HEX_TYPE_MAP: diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 48da83450d7..263d70afe26 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -1,11 +1,11 @@ -# Copyright (c) 2019-2020, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # cython: boundscheck = False -import collections.abc as abc import io import os +from collections import abc import cudf @@ -82,15 +82,15 @@ cpdef read_json(object filepaths_or_buffers, for k, v in dtype.items(): c_dtypes_map[str(k).encode()] = \ _get_cudf_data_type_from_dtype(v) - elif not isinstance(dtype, abc.Iterable): - raise TypeError("`dtype` must be 'list like' or 'dict'") - else: + elif isinstance(dtype, abc.Collection): is_list_like_dtypes = True c_dtypes_list.reserve(len(dtype)) for col_dtype in dtype: c_dtypes_list.push_back( _get_cudf_data_type_from_dtype( col_dtype)) + else: + raise TypeError("`dtype` must be 'list like' or 'dict'") cdef json_reader_options opts = move( json_reader_options.builder(make_source_info(filepaths_or_buffers)) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index fad2a973681..56b453dae95 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -4,7 +4,7 @@ from __future__ import annotations -from collections.abc import Sequence +from collections import abc from functools import wraps from inspect import isclass from typing import List, Union @@ -174,7 +174,7 @@ def is_list_like(obj): bool Return True if given object is list-like. """ - return isinstance(obj, (Sequence, np.ndarray)) and not isinstance( + return isinstance(obj, (abc.Sequence, np.ndarray)) and not isinstance( obj, (str, bytes) ) diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py index f21eb4e4d8c..09b4cc5ffba 100644 --- a/python/cudf/cudf/comm/gpuarrow.py +++ b/python/cudf/cudf/comm/gpuarrow.py @@ -1,6 +1,5 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. -from collections import OrderedDict -from collections.abc import Sequence +from collections import OrderedDict, abc import numpy as np import pandas as pd @@ -32,7 +31,7 @@ def __init__(self, source, schema=None): self._open(source, schema) -class GpuArrowReader(Sequence): +class GpuArrowReader(abc.Sequence): def __init__(self, schema, dev_ary): self._table = CudaRecordBatchStreamReader(dev_ary, schema).read_all() diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index 911391ef984..7c33b9f81fe 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -3,7 +3,7 @@ from __future__ import annotations import pickle -from collections.abc import MutableSequence +from collections import abc from functools import cached_property from typing import ( TYPE_CHECKING, @@ -1379,7 +1379,9 @@ def view(self, dtype: Dtype) -> ColumnBase: ) @staticmethod - def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn: + def _concat( + objs: abc.MutableSequence[CategoricalColumn], + ) -> CategoricalColumn: # TODO: This function currently assumes it is being called from # column.concat_columns, at least to the extent that all the # preprocessing in that function has already been done. That should be diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 291e50386cc..24a2958ce97 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -3,7 +3,7 @@ from __future__ import annotations import itertools -from collections.abc import MutableMapping +from collections import abc from functools import cached_property, reduce from typing import ( TYPE_CHECKING, @@ -78,7 +78,7 @@ def _to_flat_dict(d): return {k: v for k, v in _to_flat_dict_inner(d)} -class ColumnAccessor(MutableMapping): +class ColumnAccessor(abc.MutableMapping): """ Parameters ---------- @@ -99,7 +99,7 @@ class ColumnAccessor(MutableMapping): def __init__( self, - data: Union[MutableMapping, ColumnAccessor] = None, + data: Union[abc.MutableMapping, ColumnAccessor] = None, multiindex: bool = False, level_names=None, ): @@ -213,7 +213,7 @@ def columns(self) -> Tuple[ColumnBase, ...]: return tuple(self.values()) @cached_property - def _grouped_data(self) -> MutableMapping: + def _grouped_data(self) -> abc.MutableMapping: """ If self.multiindex is True, return the underlying mapping as a nested mapping. diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index 0fef6630248..2ec39043eb2 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -1,6 +1,6 @@ # Copyright (c) 2021-2022, NVIDIA CORPORATION. -from collections.abc import Sequence +from collections import abc import cupy import numpy as np @@ -140,7 +140,7 @@ def cut( ) # bins can either be an int, sequence of scalars or an intervalIndex - if isinstance(bins, Sequence): + if isinstance(bins, abc.Sequence): if len(set(bins)) is not len(bins): if duplicates == "raise": raise ValueError( @@ -158,7 +158,7 @@ def cut( # create bins if given an int or single scalar if not isinstance(bins, pd.IntervalIndex): - if not isinstance(bins, (Sequence)): + if not isinstance(bins, (abc.Sequence)): if isinstance( x, (pd.Series, cudf.Series, np.ndarray, cupy.ndarray) ): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 24aa0d01b3c..4ffacfa2ccc 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -9,8 +9,7 @@ import pickle import sys import warnings -from collections import defaultdict -from collections.abc import Iterable, Mapping, Sequence +from collections import abc, defaultdict from typing import ( Any, Callable, @@ -1857,7 +1856,7 @@ def _make_operands_and_index_for_binop( Optional[BaseIndex], ]: # Check built-in types first for speed. - if isinstance(other, (list, dict, Sequence, Mapping)): + if isinstance(other, (list, dict, abc.Sequence, abc.Mapping)): warnings.warn( "Binary operations between host objects such as " f"{type(other)} and cudf.DataFrame are deprecated and will be " @@ -1878,7 +1877,7 @@ def _make_operands_and_index_for_binop( if _is_scalar_or_zero_d_array(other): rhs = {name: other for name in self._data} - elif isinstance(other, (list, Sequence)): + elif isinstance(other, (list, abc.Sequence)): rhs = {name: o for (name, o) in zip(self._data, other)} elif isinstance(other, Series): rhs = dict(zip(other.index.values_host, other.values_host)) @@ -1907,7 +1906,7 @@ def _make_operands_and_index_for_binop( # the fill value. left_default = fill_value - if not isinstance(rhs, (dict, Mapping)): + if not isinstance(rhs, (dict, abc.Mapping)): return NotImplemented, None operands = { @@ -2961,7 +2960,9 @@ def agg(self, aggs, axis=None): if axis == 0 or axis is not None: raise NotImplementedError("axis not implemented yet") - if isinstance(aggs, Iterable) and not isinstance(aggs, (str, dict)): + if isinstance(aggs, abc.Iterable) and not isinstance( + aggs, (str, dict) + ): result = DataFrame() # TODO : Allow simultaneous pass for multi-aggregation as # a future optimization @@ -2997,13 +2998,13 @@ def agg(self, aggs, axis=None): f"'Series' object" ) result[key] = getattr(col, value)() - elif all([isinstance(val, Iterable) for val in aggs.values()]): + elif all([isinstance(val, abc.Iterable) for val in aggs.values()]): idxs = set() for val in aggs.values(): - if isinstance(val, Iterable): - idxs.update(val) - elif isinstance(val, str): + if isinstance(val, str): idxs.add(val) + elif isinstance(val, abc.Iterable): + idxs.update(val) idxs = sorted(list(idxs)) for agg in idxs: if agg is callable: @@ -3017,7 +3018,7 @@ def agg(self, aggs, axis=None): len(idxs), dtype=col.dtype, masked=True ) ans = cudf.Series(data=col_empty, index=idxs) - if isinstance(aggs.get(key), Iterable): + if isinstance(aggs.get(key), abc.Iterable): # TODO : Allow simultaneous pass for multi-aggregation # as a future optimization for agg in aggs.get(key): @@ -6157,7 +6158,7 @@ def _sample_axis_1( def _from_columns_like_self( self, columns: List[ColumnBase], - column_names: Iterable[str], + column_names: abc.Iterable[str], index_names: Optional[List[str]] = None, ) -> DataFrame: result = super()._from_columns_like_self( diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 4a30a78bf65..f4ce658bff3 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -1,7 +1,7 @@ # Copyright (c) 2021-2022, NVIDIA CORPORATION. -import collections import enum +from collections import abc from typing import ( Any, Dict, @@ -569,13 +569,13 @@ def get_columns(self) -> Iterable[_CuDFColumn]: ] def select_columns(self, indices: Sequence[int]) -> "_CuDFDataFrame": - if not isinstance(indices, collections.abc.Sequence): + if not isinstance(indices, abc.Sequence): raise ValueError("`indices` is not a sequence") return _CuDFDataFrame(self._df.iloc[:, indices]) def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame": - if not isinstance(names, collections.Sequence): + if not isinstance(names, abc.Sequence): raise ValueError("`names` is not a sequence") return _CuDFDataFrame( diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 40f8eda0e4f..76b0217df3b 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,9 +1,9 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -import collections import itertools import pickle import warnings +from collections import abc from functools import cached_property from typing import Any, Iterable, List, Tuple, Union @@ -1638,7 +1638,7 @@ def _handle_by_or_level(self, by=None, level=None): self._handle_series(by) elif isinstance(by, cudf.BaseIndex): self._handle_index(by) - elif isinstance(by, collections.abc.Mapping): + elif isinstance(by, abc.Mapping): self._handle_mapping(by) elif isinstance(by, Grouper): self._handle_grouper(by) @@ -1757,7 +1757,7 @@ def _is_multi_agg(aggs): Returns True if more than one aggregation is performed on any of the columns as specified in `aggs`. """ - if isinstance(aggs, collections.abc.Mapping): + if isinstance(aggs, abc.Mapping): return any(is_list_like(agg) for agg in aggs.values()) if is_list_like(aggs): return True diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index ead0cd566d9..e1057c3b997 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -2,8 +2,8 @@ from __future__ import annotations -import collections import warnings +from collections import abc from typing import TYPE_CHECKING, Any, Tuple, cast import numpy as np @@ -166,7 +166,7 @@ def _match_categorical_dtypes_both( def _coerce_to_tuple(obj): - if isinstance(obj, collections.abc.Iterable) and not isinstance(obj, str): + if isinstance(obj, abc.Iterable) and not isinstance(obj, str): return tuple(obj) else: return (obj,) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 591ec582a3b..9b0b922a7d3 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -5,7 +5,7 @@ import itertools import numbers import pickle -from collections.abc import Sequence +from collections import abc from functools import cached_property from numbers import Integral from typing import Any, List, MutableMapping, Tuple, Union @@ -95,7 +95,7 @@ def __init__( if len(levels) == 0: raise ValueError("Must pass non-zero number of levels/codes") if not isinstance(codes, cudf.DataFrame) and not isinstance( - codes[0], (Sequence, np.ndarray) + codes[0], (abc.Sequence, np.ndarray) ): raise TypeError("Codes is not a Sequence of sequences") @@ -912,7 +912,7 @@ def deserialize(cls, header, frames): def __getitem__(self, index): flatten = isinstance(index, int) - if isinstance(index, (Integral, Sequence)): + if isinstance(index, (Integral, abc.Sequence)): index = np.array(index) elif isinstance(index, slice): start, stop, step = index.indices(len(self)) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index a388e2560ee..f58c93aa0dc 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,6 +1,7 @@ # Copyright (c) 2018-2022, NVIDIA CORPORATION. import itertools +from collections import abc from typing import Dict, Optional import numpy as np @@ -485,14 +486,14 @@ def melt( 1 b B 3 2 c B 5 """ - assert col_level in (None,) + if col_level is not None: + raise NotImplementedError("col_level != None is not supported yet.") # Arg cleaning - import collections # id_vars if id_vars is not None: - if not isinstance(id_vars, collections.abc.Sequence): + if not isinstance(id_vars, abc.Sequence): id_vars = [id_vars] id_vars = list(id_vars) missing = set(id_vars) - set(frame._column_names) @@ -506,7 +507,7 @@ def melt( # value_vars if value_vars is not None: - if not isinstance(value_vars, collections.abc.Sequence): + if not isinstance(value_vars, abc.Sequence): value_vars = [value_vars] value_vars = list(value_vars) missing = set(value_vars) - set(frame._column_names) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 20ba52afccd..f780b5e3895 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -6,7 +6,7 @@ import inspect import pickle import warnings -from collections import abc as abc +from collections import abc from shutil import get_terminal_size from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Type, Union diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 4dd9f434097..fbae7850e60 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -3,7 +3,7 @@ import itertools import re import warnings -from collections.abc import Mapping, Sequence +from collections import abc from contextlib import contextmanager from decimal import Decimal @@ -238,9 +238,9 @@ def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs): else: if len(func_args_and_kwargs) == 1: func_args, func_kwargs = [], {} - if isinstance(func_args_and_kwargs[0], Sequence): + if isinstance(func_args_and_kwargs[0], abc.Sequence): func_args = func_args_and_kwargs[0] - elif isinstance(func_args_and_kwargs[0], Mapping): + elif isinstance(func_args_and_kwargs[0], abc.Mapping): func_kwargs = func_args_and_kwargs[0] else: raise ValueError( @@ -248,12 +248,12 @@ def _get_args_kwars_for_assert_exceptions(func_args_and_kwargs): "either a Sequence or a Mapping" ) elif len(func_args_and_kwargs) == 2: - if not isinstance(func_args_and_kwargs[0], Sequence): + if not isinstance(func_args_and_kwargs[0], abc.Sequence): raise ValueError( "Positional argument at 1st position of " "func_args_and_kwargs should be a sequence." ) - if not isinstance(func_args_and_kwargs[1], Mapping): + if not isinstance(func_args_and_kwargs[1], abc.Mapping): raise ValueError( "Key-word argument at 2nd position of " "func_args_and_kwargs should be a dictionary mapping."