diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b8524a302f4c9..8fd9bc3424ed5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -86,11 +86,10 @@ repos: types: [python] exclude: ^pandas/_typing\.py$ - id: inconsistent-namespace-usage - name: 'Check for inconsistent use of pandas namespace in tests' + name: 'Check for inconsistent use of pandas namespace' entry: python scripts/check_for_inconsistent_pandas_namespace.py language: python types: [python] - files: ^pandas/tests/ - id: incorrect-code-directives name: Check for incorrect code block or IPython directives language: pygrep diff --git a/asv_bench/benchmarks/arithmetic.py b/asv_bench/benchmarks/arithmetic.py index 488237a6f5a8b..bfb1be8705495 100644 --- a/asv_bench/benchmarks/arithmetic.py +++ b/asv_bench/benchmarks/arithmetic.py @@ -140,9 +140,7 @@ def setup(self, op, shape): # construct dataframe with 2 blocks arr1 = np.random.randn(n_rows, n_cols // 2).astype("f8") arr2 = np.random.randn(n_rows, n_cols // 2).astype("f4") - df = pd.concat( - [pd.DataFrame(arr1), pd.DataFrame(arr2)], axis=1, ignore_index=True - ) + df = pd.concat([DataFrame(arr1), DataFrame(arr2)], axis=1, ignore_index=True) # should already be the case, but just to be sure df._consolidate_inplace() @@ -151,7 +149,7 @@ def setup(self, op, shape): arr2 = np.random.randn(n_rows, n_cols // 2).astype("i8") arr3 = np.random.randn(n_rows, n_cols // 4).astype("f8") df2 = pd.concat( - [pd.DataFrame(arr1), pd.DataFrame(arr2), pd.DataFrame(arr3)], + [DataFrame(arr1), DataFrame(arr2), DataFrame(arr3)], axis=1, ignore_index=True, ) @@ -459,9 +457,9 @@ class OffsetArrayArithmetic: def setup(self, offset): N = 10000 - rng = pd.date_range(start="1/1/2000", periods=N, freq="T") + rng = date_range(start="1/1/2000", periods=N, freq="T") self.rng = rng - self.ser = pd.Series(rng) + self.ser = Series(rng) def time_add_series_offset(self, offset): with warnings.catch_warnings(record=True): @@ -478,7 +476,7 @@ class ApplyIndex: def setup(self, offset): N = 10000 - rng = pd.date_range(start="1/1/2000", periods=N, freq="T") + rng = date_range(start="1/1/2000", periods=N, freq="T") self.rng = rng def time_apply_index(self, offset): @@ -490,17 +488,17 @@ class BinaryOpsMultiIndex: param_names = ["func"] def setup(self, func): - date_range = pd.date_range("20200101 00:00", "20200102 0:00", freq="S") + array = date_range("20200101 00:00", "20200102 0:00", freq="S") level_0_names = [str(i) for i in range(30)] - index = pd.MultiIndex.from_product([level_0_names, date_range]) + index = pd.MultiIndex.from_product([level_0_names, array]) column_names = ["col_1", "col_2"] - self.df = pd.DataFrame( + self.df = DataFrame( np.random.rand(len(index), 2), index=index, columns=column_names ) - self.arg_df = pd.DataFrame( + self.arg_df = DataFrame( np.random.randint(1, 10, (len(level_0_names), 2)), index=level_0_names, columns=column_names, diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 5006a0dbf1f98..35e5818cd3b2b 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -28,7 +28,7 @@ def setup(self): data = np.random.randn(N)[:-i] idx = rng[:-i] data[100:] = np.nan - self.series[i] = pd.Series(pd.SparseArray(data), index=idx) + self.series[i] = Series(SparseArray(data), index=idx) def time_series_to_frame(self): pd.DataFrame(self.series) @@ -63,7 +63,7 @@ def setup(self): ) def time_sparse_series_from_coo(self): - pd.Series.sparse.from_coo(self.matrix) + Series.sparse.from_coo(self.matrix) class ToCoo: diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index adea9f6c19996..9bacb30b78a64 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -207,12 +207,12 @@ def box_expected(expected, box_cls, transpose=True): """ if box_cls is pd.array: expected = pd.array(expected) - elif box_cls is pd.Index: - expected = pd.Index(expected) - elif box_cls is pd.Series: - expected = pd.Series(expected) - elif box_cls is pd.DataFrame: - expected = pd.Series(expected).to_frame() + elif box_cls is Index: + expected = Index(expected) + elif box_cls is Series: + expected = Series(expected) + elif box_cls is DataFrame: + expected = Series(expected).to_frame() if transpose: # for vector operations, we need a DataFrame to be a single-row, # not a single-column, in order to operate against non-DataFrame @@ -400,7 +400,7 @@ def _make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None) "x": state.rand(n) * 2 - 1, "y": state.rand(n) * 2 - 1, } - df = pd.DataFrame(columns, index=index, columns=sorted(columns)) + df = DataFrame(columns, index=index, columns=sorted(columns)) if df.index[-1] == end: df = df.iloc[:-1] return df diff --git a/pandas/conftest.py b/pandas/conftest.py index aa43746d0e7d5..3fdde3261bd68 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -328,7 +328,7 @@ def unique_nulls_fixture(request): # ---------------------------------------------------------------- -@pytest.fixture(params=[pd.DataFrame, pd.Series]) +@pytest.fixture(params=[DataFrame, Series]) def frame_or_series(request): """ Fixture to parametrize over DataFrame and Series. @@ -338,7 +338,7 @@ def frame_or_series(request): # error: List item 0 has incompatible type "Type[Index]"; expected "Type[IndexOpsMixin]" @pytest.fixture( - params=[pd.Index, pd.Series], ids=["index", "series"] # type: ignore[list-item] + params=[Index, Series], ids=["index", "series"] # type: ignore[list-item] ) def index_or_series(request): """ @@ -356,9 +356,7 @@ def index_or_series(request): index_or_series2 = index_or_series -@pytest.fixture( - params=[pd.Index, pd.Series, pd.array], ids=["index", "series", "array"] -) +@pytest.fixture(params=[Index, Series, pd.array], ids=["index", "series", "array"]) def index_or_series_or_array(request): """ Fixture to parametrize over Index, Series, and ExtensionArray @@ -559,7 +557,7 @@ def index_with_missing(request): # ---------------------------------------------------------------- @pytest.fixture def empty_series(): - return pd.Series([], index=[], dtype=np.float64) + return Series([], index=[], dtype=np.float64) @pytest.fixture @@ -596,7 +594,7 @@ def _create_series(index): """ Helper for the _series dict """ size = len(index) data = np.random.randn(size) - return pd.Series(data, index=index, name="a") + return Series(data, index=index, name="a") _series = { @@ -1437,7 +1435,7 @@ def any_numpy_dtype(request): ("boolean", [True, np.nan, False]), ("boolean", [True, pd.NA, False]), ("datetime64", [np.datetime64("2013-01-01"), np.nan, np.datetime64("2018-01-01")]), - ("datetime", [pd.Timestamp("20130101"), np.nan, pd.Timestamp("20180101")]), + ("datetime", [Timestamp("20130101"), np.nan, Timestamp("20180101")]), ("date", [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), # The following two dtypes are commented out due to GH 23554 # ('complex', [1 + 1j, np.nan, 2 + 2j]), @@ -1445,8 +1443,8 @@ def any_numpy_dtype(request): # np.nan, np.timedelta64(2, 'D')]), ("timedelta", [timedelta(1), np.nan, timedelta(2)]), ("time", [time(1), np.nan, time(2)]), - ("period", [pd.Period(2013), pd.NaT, pd.Period(2018)]), - ("interval", [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)]), + ("period", [Period(2013), pd.NaT, Period(2018)]), + ("interval", [Interval(0, 1), np.nan, Interval(0, 2)]), ] ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 5550da7421e00..a9c1de8a382ea 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -186,8 +186,8 @@ def __init__( if not data.index.is_unique or not data.columns.is_unique: raise ValueError("style is not supported for non-unique indices.") self.data: DataFrame = data - self.index: pd.Index = data.index - self.columns: pd.Index = data.columns + self.index: Index = data.index + self.columns: Index = data.columns self.table_styles = table_styles if not isinstance(uuid_len, int) or not uuid_len >= 0: raise TypeError("``uuid_len`` must be an integer in range [0, 32].") @@ -913,7 +913,7 @@ def _apply( result.columns = data.columns else: result = func(data, **kwargs) - if not isinstance(result, pd.DataFrame): + if not isinstance(result, DataFrame): if not isinstance(result, np.ndarray): raise TypeError( f"Function {repr(func)} must return a DataFrame or ndarray " @@ -1565,7 +1565,7 @@ def css(rgba) -> str: if s.ndim == 1: return [css(rgba) for rgba in rgbas] else: - return pd.DataFrame( + return DataFrame( [[css(rgba) for rgba in row] for row in rgbas], index=s.index, columns=s.columns, @@ -1655,7 +1655,7 @@ def css(x): if s.ndim == 1: return [css(x) for x in normed] else: - return pd.DataFrame( + return DataFrame( [[css(x) for x in row] for row in normed], index=s.index, columns=s.columns, diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 1a7e2d1d820f7..62d368264752b 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -1372,9 +1372,9 @@ def array_likes(request): data = memoryview(arr) elif name == "array": # stdlib array - from array import array as array_stdlib + import array - data = array_stdlib("i", arr) + data = array.array("i", arr) elif name == "dask": import dask.array diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 869255505eb74..ca68885fdc470 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -1236,14 +1236,14 @@ def __len__(self, n): def test_constructor_stdlib_array(self): # GH 4297 # support Array - from array import array as stdlib_array + import array - result = DataFrame({"A": stdlib_array("i", range(10))}) + result = DataFrame({"A": array.array("i", range(10))}) expected = DataFrame({"A": list(range(10))}) tm.assert_frame_equal(result, expected, check_dtype=False) expected = DataFrame([list(range(10)), list(range(10))]) - result = DataFrame([stdlib_array("i", range(10)), stdlib_array("i", range(10))]) + result = DataFrame([array.array("i", range(10)), array.array("i", range(10))]) tm.assert_frame_equal(result, expected, check_dtype=False) def test_constructor_range(self): diff --git a/scripts/check_for_inconsistent_pandas_namespace.py b/scripts/check_for_inconsistent_pandas_namespace.py index c84a92324f976..3c21821e794a9 100644 --- a/scripts/check_for_inconsistent_pandas_namespace.py +++ b/scripts/check_for_inconsistent_pandas_namespace.py @@ -2,7 +2,7 @@ Check that test suite file doesn't use the pandas namespace inconsistently. We check for cases of ``Series`` and ``pd.Series`` appearing in the same file -(likewise for some other common classes). +(likewise for other pandas objects). This is meant to be run as a pre-commit hook - to run it manually, you can do: @@ -15,43 +15,50 @@ though note that you may need to manually fixup some imports and that you will also need the additional dependency `tokenize-rt` (which is left out from the pre-commit hook so that it uses the same virtualenv as the other local ones). + +The general structure is similar to that of some plugins from +https://github.com/asottile/pyupgrade . """ import argparse import ast +import sys from typing import ( MutableMapping, + NamedTuple, Optional, Sequence, Set, - Tuple, ) -ERROR_MESSAGE = "Found both `pd.{name}` and `{name}` in {path}" -EXCLUDE = { - "eval", # built-in, different from `pd.eval` - "np", # pd.np is deprecated but still tested -} -Offset = Tuple[int, int] +ERROR_MESSAGE = ( + "{path}:{lineno}:{col_offset}: " + "Found both '{prefix}.{name}' and '{name}' in {path}" +) + + +class OffsetWithNamespace(NamedTuple): + lineno: int + col_offset: int + namespace: str class Visitor(ast.NodeVisitor): def __init__(self) -> None: - self.pandas_namespace: MutableMapping[Offset, str] = {} - self.no_namespace: Set[str] = set() + self.pandas_namespace: MutableMapping[OffsetWithNamespace, str] = {} + self.imported_from_pandas: Set[str] = set() def visit_Attribute(self, node: ast.Attribute) -> None: - if ( - isinstance(node.value, ast.Name) - and node.value.id == "pd" - and node.attr not in EXCLUDE - ): - self.pandas_namespace[(node.lineno, node.col_offset)] = node.attr + if isinstance(node.value, ast.Name) and node.value.id in {"pandas", "pd"}: + offset_with_namespace = OffsetWithNamespace( + node.lineno, node.col_offset, node.value.id + ) + self.pandas_namespace[offset_with_namespace] = node.attr self.generic_visit(node) - def visit_Name(self, node: ast.Name) -> None: - if node.id not in EXCLUDE: - self.no_namespace.add(node.id) + def visit_ImportFrom(self, node: ast.ImportFrom) -> None: + if node.module is not None and "pandas" in node.module: + self.imported_from_pandas.update(name.name for name in node.names) self.generic_visit(node) @@ -64,9 +71,11 @@ def replace_inconsistent_pandas_namespace(visitor: Visitor, content: str) -> str tokens = src_to_tokens(content) for n, i in reversed_enumerate(tokens): + offset_with_namespace = OffsetWithNamespace(i.offset[0], i.offset[1], i.src) if ( - i.offset in visitor.pandas_namespace - and visitor.pandas_namespace[i.offset] in visitor.no_namespace + offset_with_namespace in visitor.pandas_namespace + and visitor.pandas_namespace[offset_with_namespace] + in visitor.imported_from_pandas ): # Replace `pd` tokens[n] = i._replace(src="") @@ -85,16 +94,28 @@ def check_for_inconsistent_pandas_namespace( visitor = Visitor() visitor.visit(tree) - inconsistencies = visitor.no_namespace.intersection( + inconsistencies = visitor.imported_from_pandas.intersection( visitor.pandas_namespace.values() ) + if not inconsistencies: # No inconsistent namespace usage, nothing to replace. - return content + return None if not replace: - msg = ERROR_MESSAGE.format(name=inconsistencies.pop(), path=path) - raise RuntimeError(msg) + inconsistency = inconsistencies.pop() + lineno, col_offset, prefix = next( + key for key, val in visitor.pandas_namespace.items() if val == inconsistency + ) + msg = ERROR_MESSAGE.format( + lineno=lineno, + col_offset=col_offset, + prefix=prefix, + name=inconsistency, + path=path, + ) + sys.stdout.write(msg) + sys.exit(1) return replace_inconsistent_pandas_namespace(visitor, content) diff --git a/scripts/tests/test_inconsistent_namespace_check.py b/scripts/tests/test_inconsistent_namespace_check.py index 9562a30ba0ffd..eb995158d8cb4 100644 --- a/scripts/tests/test_inconsistent_namespace_check.py +++ b/scripts/tests/test_inconsistent_namespace_check.py @@ -4,35 +4,58 @@ check_for_inconsistent_pandas_namespace, ) -BAD_FILE_0 = "cat_0 = Categorical()\ncat_1 = pd.Categorical()" -BAD_FILE_1 = "cat_0 = pd.Categorical()\ncat_1 = Categorical()" -GOOD_FILE_0 = "cat_0 = Categorical()\ncat_1 = Categorical()" +BAD_FILE_0 = ( + "from pandas import Categorical\n" + "cat_0 = Categorical()\n" + "cat_1 = pd.Categorical()" +) +BAD_FILE_1 = ( + "from pandas import Categorical\n" + "cat_0 = pd.Categorical()\n" + "cat_1 = Categorical()" +) +BAD_FILE_2 = ( + "from pandas import Categorical\n" + "cat_0 = pandas.Categorical()\n" + "cat_1 = Categorical()" +) +GOOD_FILE_0 = ( + "from pandas import Categorical\ncat_0 = Categorical()\ncat_1 = Categorical()" +) GOOD_FILE_1 = "cat_0 = pd.Categorical()\ncat_1 = pd.Categorical()" +GOOD_FILE_2 = "from array import array\nimport pandas as pd\narr = pd.array([])" PATH = "t.py" -@pytest.mark.parametrize("content", [BAD_FILE_0, BAD_FILE_1]) -def test_inconsistent_usage(content): - msg = r"Found both `pd\.Categorical` and `Categorical` in t\.py" - with pytest.raises(RuntimeError, match=msg): +@pytest.mark.parametrize( + "content, expected", + [ + (BAD_FILE_0, "t.py:3:8: Found both 'pd.Categorical' and 'Categorical' in t.py"), + (BAD_FILE_1, "t.py:2:8: Found both 'pd.Categorical' and 'Categorical' in t.py"), + ( + BAD_FILE_2, + "t.py:2:8: Found both 'pandas.Categorical' and 'Categorical' in t.py", + ), + ], +) +def test_inconsistent_usage(content, expected, capsys): + with pytest.raises(SystemExit): check_for_inconsistent_pandas_namespace(content, PATH, replace=False) + result, _ = capsys.readouterr() + assert result == expected -@pytest.mark.parametrize("content", [GOOD_FILE_0, GOOD_FILE_1]) -def test_consistent_usage(content): +@pytest.mark.parametrize("content", [GOOD_FILE_0, GOOD_FILE_1, GOOD_FILE_2]) +@pytest.mark.parametrize("replace", [True, False]) +def test_consistent_usage(content, replace): # should not raise - check_for_inconsistent_pandas_namespace(content, PATH, replace=False) + check_for_inconsistent_pandas_namespace(content, PATH, replace=replace) -@pytest.mark.parametrize("content", [BAD_FILE_0, BAD_FILE_1]) +@pytest.mark.parametrize("content", [BAD_FILE_0, BAD_FILE_1, BAD_FILE_2]) def test_inconsistent_usage_with_replace(content): result = check_for_inconsistent_pandas_namespace(content, PATH, replace=True) - expected = "cat_0 = Categorical()\ncat_1 = Categorical()" - assert result == expected - - -@pytest.mark.parametrize("content", [GOOD_FILE_0, GOOD_FILE_1]) -def test_consistent_usage_with_replace(content): - result = check_for_inconsistent_pandas_namespace(content, PATH, replace=True) - expected = content + expected = ( + "from pandas import Categorical\ncat_0 = Categorical()\ncat_1 = Categorical()" + ) assert result == expected