diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 92c10df3e4e97..ae4d881571607 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -163,10 +163,7 @@ jobs:
         pytest pandas/tests/resample/
         pytest pandas/tests/reshape/merge
         pytest pandas/tests/series/
-
-        # indexing subset (temporary since other tests don't pass yet)
-        pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_astype_assignment_with_dups
-        pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_frame_setitem_multi_column
+        pytest pandas/tests/indexing/
 
         pytest pandas/tests/api/
         pytest pandas/tests/apply/
diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py
index d0a53ec80ce1a..db28ad710989d 100644
--- a/pandas/core/indexers.py
+++ b/pandas/core/indexers.py
@@ -342,6 +342,10 @@ def length_of_indexer(indexer, target=None) -> int:
             # GH#25774
             return indexer.sum()
         return len(indexer)
+    elif isinstance(indexer, range):
+        # len(range) is exact; floor-dividing the span by the step is wrong
+        # for uneven steps, negative steps and empty ranges
+        return len(indexer)
     elif not is_list_like_indexer(indexer):
         return 1
     raise AssertionError("cannot find the length of the indexer")
diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
index 2177839eb34ce..ef9981f40efe1 100644
--- a/pandas/core/internals/array_manager.py
+++ b/pandas/core/internals/array_manager.py
@@ -228,6 +228,11 @@ def _verify_integrity(self) -> None:
                     "Passed arrays should be np.ndarray or ExtensionArray instances, "
                     f"got {type(arr)} instead"
                 )
+            if not arr.ndim == 1:
+                raise ValueError(
+                    "Passed arrays should be 1-dimensional, got array with "
+                    f"{arr.ndim} dimensions instead."
+ ) def reduce( self: T, func: Callable, ignore_failures: bool = False @@ -1040,6 +1045,9 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T: else np.asanyarray(indexer, dtype="int64") ) + if not indexer.ndim == 1: + raise ValueError("indexer should be 1-dimensional") + n = self.shape_proper[axis] indexer = maybe_convert_indices(indexer, n, verify=verify) diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index f71b39d53d825..1db354a7f30b5 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, MultiIndex, @@ -32,6 +34,7 @@ def test_detect_chained_assignment(): zed["eyes"]["right"].fillna(value=555, inplace=True) +@td.skip_array_manager_invalid_test # with ArrayManager df.loc[0] is not a view def test_cache_updating(): # 5216 # make sure that we don't try to set a dead cache diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index c203d986efd23..932295c28c8cf 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, Float64Index, @@ -114,6 +116,9 @@ def test_getitem_partial_column_select(self): with pytest.raises(KeyError, match=r"\('a', 'foo'\)"): df.loc[("a", "foo"), :] + # TODO(ArrayManager) rewrite test to not use .values + # exp.loc[2000, 4].values[:] select multiple columns -> .values is not a view + @td.skip_array_manager_invalid_test def test_partial_set(self, multiindex_year_month_day_dataframe_random_data): # GH #397 ymd = multiindex_year_month_day_dataframe_random_data diff --git 
a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 9e85f9f65a3bc..5d0aeba4aebbc 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -119,6 +121,9 @@ def test_setitem_multiindex3(self): expected=copy, ) + # TODO(ArrayManager) df.loc["bar"] *= 2 doesn't raise an error but results in + # all NaNs -> doesn't work in the "split" path (also for BlockManager actually) + @td.skip_array_manager_not_yet_implemented def test_multiindex_setitem(self): # GH 3738 @@ -457,6 +462,8 @@ def test_setitem_new_column_all_na(self): assert df["new"].isna().all() +@td.skip_array_manager_invalid_test # df["foo"] select multiple columns -> .values +# is not a view def test_frame_setitem_view_direct(multiindex_dataframe_random_data): # this works because we are modifying the underlying array # really a no-no diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 49181f0fdee7e..f450625629c71 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -3,6 +3,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -169,7 +171,7 @@ def test_detect_chained_assignment(self): tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow - def test_detect_chained_assignment_raises(self): + def test_detect_chained_assignment_raises(self, using_array_manager): # test with the chaining df = DataFrame( @@ -180,13 +182,23 @@ def test_detect_chained_assignment_raises(self): ) assert df._is_copy is None - with pytest.raises(com.SettingWithCopyError, match=msg): - df["A"][0] = -5 + if not using_array_manager: + with 
pytest.raises(com.SettingWithCopyError, match=msg): + df["A"][0] = -5 - with pytest.raises(com.SettingWithCopyError, match=msg): - df["A"][1] = np.nan + with pytest.raises(com.SettingWithCopyError, match=msg): + df["A"][1] = np.nan + + assert df["A"]._is_copy is None - assert df["A"]._is_copy is None + else: + # INFO(ArrayManager) for ArrayManager it doesn't matter that it's + # a mixed dataframe + df["A"][0] = -5 + df["A"][1] = -6 + expected = DataFrame([[-5, 2], [-6, 3]], columns=list("AB")) + expected["B"] = expected["B"].astype("float64") + tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow def test_detect_chained_assignment_fails(self): @@ -219,18 +231,24 @@ def test_detect_chained_assignment_doc_example(self): df[indexer]["c"] = 42 @pytest.mark.arm_slow - def test_detect_chained_assignment_object_dtype(self): + def test_detect_chained_assignment_object_dtype(self, using_array_manager): expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]}) df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) - with pytest.raises(com.SettingWithCopyError, match=msg): - df["A"][0] = 111 - with pytest.raises(com.SettingWithCopyError, match=msg): df.loc[0]["A"] = 111 - df.loc[0, "A"] = 111 + if not using_array_manager: + with pytest.raises(com.SettingWithCopyError, match=msg): + df["A"][0] = 111 + + df.loc[0, "A"] = 111 + else: + # INFO(ArrayManager) for ArrayManager it doesn't matter that it's + # a mixed dataframe + df["A"][0] = 111 + tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow @@ -347,7 +365,7 @@ def test_detect_chained_assignment_undefined_column(self): df.iloc[0:5]["group"] = "a" @pytest.mark.arm_slow - def test_detect_chained_assignment_changing_dtype(self): + def test_detect_chained_assignment_changing_dtype(self, using_array_manager): # Mixed type setting but same dtype & changing dtype df = DataFrame( @@ -365,8 +383,14 @@ def test_detect_chained_assignment_changing_dtype(self): with pytest.raises(com.SettingWithCopyError, 
match=msg): df.loc[2]["C"] = "foo" - with pytest.raises(com.SettingWithCopyError, match=msg): + if not using_array_manager: + with pytest.raises(com.SettingWithCopyError, match=msg): + df["C"][2] = "foo" + else: + # INFO(ArrayManager) for ArrayManager it doesn't matter if it's + # changing the dtype or not df["C"][2] = "foo" + assert df.loc[2, "C"] == "foo" def test_setting_with_copy_bug(self): @@ -411,6 +435,8 @@ def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self): ) tm.assert_frame_equal(df, expected) + # TODO(ArrayManager) fast_xs with array-like scalars is not yet working + @td.skip_array_manager_not_yet_implemented def test_chained_getitem_with_lists(self): # GH6394 diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 6a0a6ed18e2e1..e2a063a7697d9 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -10,6 +10,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( Categorical, CategoricalDtype, @@ -63,12 +65,13 @@ class TestiLocBaseIndependent: ], ) @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc]) - def test_iloc_setitem_fullcol_categorical(self, indexer, key): + def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manager): frame = DataFrame({0: range(3)}, dtype=object) cat = Categorical(["alpha", "beta", "gamma"]) - assert frame._mgr.blocks[0]._can_hold_element(cat) + if not using_array_manager: + assert frame._mgr.blocks[0]._can_hold_element(cat) df = frame.copy() orig_vals = df.values @@ -76,13 +79,16 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key): overwrite = isinstance(key, slice) and key == slice(None) - if overwrite: + if overwrite or using_array_manager: + # TODO(ArrayManager) we always overwrite because ArrayManager takes + # the "split" path, which still overwrites # TODO: GH#39986 this probably shouldn't behave differently expected = DataFrame({0: cat}) assert not 
np.shares_memory(df.values, orig_vals) else: expected = DataFrame({0: cat}).astype(object) - assert np.shares_memory(df.values, orig_vals) + if not using_array_manager: + assert np.shares_memory(df[0].values, orig_vals) tm.assert_frame_equal(df, expected) @@ -93,13 +99,27 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key): else: assert cat[0] != "gamma" + # TODO with mixed dataframe ("split" path), we always overwrite the column + frame = DataFrame({0: np.array([0, 1, 2], dtype=object), 1: range(3)}) + df = frame.copy() + orig_vals = df.values + indexer(df)[key, 0] = cat + expected = DataFrame({0: cat, 1: range(3)}) + tm.assert_frame_equal(df, expected) + + # TODO(ArrayManager) does not yet update parent + @td.skip_array_manager_not_yet_implemented @pytest.mark.parametrize("box", [array, Series]) - def test_iloc_setitem_ea_inplace(self, frame_or_series, box): + def test_iloc_setitem_ea_inplace(self, frame_or_series, box, using_array_manager): # GH#38952 Case with not setting a full column # IntegerArray without NAs arr = array([1, 2, 3, 4]) obj = frame_or_series(arr.to_numpy("i8")) - values = obj.values + + if frame_or_series is Series or not using_array_manager: + values = obj.values + else: + values = obj[0].values obj.iloc[:2] = box(arr[2:]) expected = frame_or_series(np.array([3, 4, 3, 4], dtype="i8")) @@ -109,7 +129,10 @@ def test_iloc_setitem_ea_inplace(self, frame_or_series, box): if frame_or_series is Series: assert obj.values is values else: - assert obj.values.base is values.base and values.base is not None + if using_array_manager: + assert obj[0].values is values + else: + assert obj.values.base is values.base and values.base is not None def test_is_scalar_access(self): # GH#32085 index with duplicates doesn't matter for _is_scalar_access @@ -481,13 +504,16 @@ def test_iloc_setitem_dups(self): df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(drop=True) tm.assert_frame_equal(df, expected) - def 
test_iloc_setitem_frame_duplicate_columns_multiple_blocks(self): + def test_iloc_setitem_frame_duplicate_columns_multiple_blocks( + self, using_array_manager + ): # Same as the "assign back to self" check in test_iloc_setitem_dups # but on a DataFrame with multiple blocks df = DataFrame([[0, 1], [2, 3]], columns=["B", "B"]) df.iloc[:, 0] = df.iloc[:, 0].astype("f8") - assert len(df._mgr.blocks) == 2 + if not using_array_manager: + assert len(df._mgr.blocks) == 2 expected = df.copy() # assign back to self @@ -577,7 +603,7 @@ def test_iloc_getitem_labelled_frame(self): with pytest.raises(ValueError, match=msg): df.iloc["j", "D"] - def test_iloc_getitem_doc_issue(self): + def test_iloc_getitem_doc_issue(self, using_array_manager): # multi axis slicing issue with single block # surfaced in GH 6059 @@ -612,7 +638,8 @@ def test_iloc_getitem_doc_issue(self): columns = list(range(0, 8, 2)) df = DataFrame(arr, index=index, columns=columns) - df._mgr.blocks[0].mgr_locs + if not using_array_manager: + df._mgr.blocks[0].mgr_locs result = df.iloc[1:5, 2:4] str(result) result.dtypes @@ -793,7 +820,7 @@ def test_iloc_empty_list_indexer_is_ok(self): df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True ) - def test_identity_slice_returns_new_object(self): + def test_identity_slice_returns_new_object(self, using_array_manager): # GH13873 original_df = DataFrame({"a": [1, 2, 3]}) sliced_df = original_df.iloc[:] @@ -801,7 +828,12 @@ def test_identity_slice_returns_new_object(self): # should be a shallow copy original_df["a"] = [4, 4, 4] - assert (sliced_df["a"] == 4).all() + if using_array_manager: + # TODO(ArrayManager) verify it is expected that the original didn't change + # setitem is replacing full column, so doesn't update "viewing" dataframe + assert not (sliced_df["a"] == 4).all() + else: + assert (sliced_df["a"] == 4).all() original_series = Series([1, 2, 3, 4, 5, 6]) sliced_series = original_series.iloc[:] @@ -932,6 +964,9 @@ def 
test_iloc_getitem_readonly_key(self): expected = df["data"].loc[[1, 3, 6]] tm.assert_series_equal(result, expected) + # TODO(ArrayManager) setting single item with an iterable doesn't work yet + # in the "split" path + @td.skip_array_manager_not_yet_implemented def test_iloc_assign_series_to_df_cell(self): # GH 37593 df = DataFrame(columns=["a"], index=[0]) @@ -1088,6 +1123,8 @@ def test_iloc_getitem_setitem_fancy_exceptions(self, float_frame): # GH#32257 we let numpy do validation, get their exception float_frame.iloc[:, :, :] = 1 + # TODO(ArrayManager) "split" path doesn't properly implement DataFrame indexer + @td.skip_array_manager_not_yet_implemented def test_iloc_frame_indexer(self): # GH#39004 df = DataFrame({"a": [1, 2, 3]}) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index b72a7c1081d0e..df688d6745096 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -7,6 +7,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, @@ -67,7 +69,9 @@ def test_setitem_ndarray_1d_2(self): with pytest.raises(ValueError, match=msg): df[2:5] = np.arange(1, 4) * 1j - def test_getitem_ndarray_3d(self, index, frame_or_series, indexer_sli): + def test_getitem_ndarray_3d( + self, index, frame_or_series, indexer_sli, using_array_manager + ): # GH 25567 obj = gen_obj(frame_or_series, index) idxr = indexer_sli(obj) @@ -76,8 +80,12 @@ def test_getitem_ndarray_3d(self, index, frame_or_series, indexer_sli): msgs = [] if frame_or_series is Series and indexer_sli in [tm.setitem, tm.iloc]: msgs.append(r"Wrong number of dimensions. 
values.ndim > ndim \[3 > 1\]") + if using_array_manager: + msgs.append("Passed array should be 1-dimensional") if frame_or_series is Series or indexer_sli is tm.iloc: msgs.append(r"Buffer has wrong number of dimensions \(expected 1, got 3\)") + if using_array_manager: + msgs.append("indexer should be 1-dimensional") if indexer_sli is tm.loc or ( frame_or_series is Series and indexer_sli is tm.setitem ): @@ -92,8 +100,7 @@ def test_getitem_ndarray_3d(self, index, frame_or_series, indexer_sli): potential_errors = (IndexError, ValueError, NotImplementedError) with pytest.raises(potential_errors, match=msg): - with tm.assert_produces_warning(DeprecationWarning): - idxr[nd3] + idxr[nd3] def test_setitem_ndarray_3d(self, index, frame_or_series, indexer_sli): # GH 25567 @@ -481,6 +488,9 @@ def test_multi_assign_broadcasting_rhs(self): df.loc[df["A"] == 0, ["A", "B"]] = df["D"] tm.assert_frame_equal(df, expected) + # TODO(ArrayManager) setting single item with an iterable doesn't work yet + # in the "split" path + @td.skip_array_manager_not_yet_implemented def test_setitem_list(self): # GH 6043 diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 6c143b3e91d42..85accac5a8235 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -587,7 +587,7 @@ def test_loc_modify_datetime(self): tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame_with_reindex(self): + def test_loc_setitem_frame_with_reindex(self, using_array_manager): # GH#6254 setting issue df = DataFrame(index=[3, 5, 4], columns=["A"], dtype=float) df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64") @@ -595,9 +595,30 @@ def test_loc_setitem_frame_with_reindex(self): # setting integer values into a float dataframe with loc is inplace, # so we retain float dtype ser = Series([2, 3, 1], index=[3, 5, 4], dtype=float) + if using_array_manager: + # TODO(ArrayManager) with "split" path, we still overwrite the column + # and therefore 
don't take the order of the indexer into account + ser = Series([1, 2, 3], index=[3, 5, 4], dtype="int64") expected = DataFrame({"A": ser}) tm.assert_frame_equal(df, expected) + @pytest.mark.xfail(reason="split path wrong update - GH40480") + def test_loc_setitem_frame_with_reindex_mixed(self): + # same test as above, but with mixed dataframe + # TODO with "split" path we still actually overwrite the column + # and therefore don't take the order of the indexer into account + # -> this is a bug: https://github.com/pandas-dev/pandas/issues/40480 + df = DataFrame(index=[3, 5, 4], columns=["A", "B"], dtype=float) + df["B"] = "string" + df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64") + ser = Series([2, 3, 1], index=[3, 5, 4], dtype="int64") + expected = DataFrame({"A": ser}) + expected["B"] = "string" + tm.assert_frame_equal(df, expected) + + # TODO(ArrayManager) "split" path overwrites column and therefore don't take + # the order of the indexer into account + @td.skip_array_manager_not_yet_implemented def test_loc_setitem_empty_frame(self): # GH#6252 setting with an empty frame keys1 = ["@" + str(i) for i in range(5)] @@ -930,7 +951,7 @@ def test_loc_empty_list_indexer_is_ok(self): df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True ) - def test_identity_slice_returns_new_object(self): + def test_identity_slice_returns_new_object(self, using_array_manager): # GH13873 original_df = DataFrame({"a": [1, 2, 3]}) sliced_df = original_df.loc[:] @@ -939,7 +960,12 @@ def test_identity_slice_returns_new_object(self): # should be a shallow copy original_df["a"] = [4, 4, 4] - assert (sliced_df["a"] == 4).all() + if using_array_manager: + # TODO(ArrayManager) verify it is expected that the original didn't change + # setitem is replacing full column, so doesn't update "viewing" dataframe + assert not (sliced_df["a"] == 4).all() + else: + assert (sliced_df["a"] == 4).all() # These should not return copies assert original_df is original_df.loc[:, :] 
@@ -1017,6 +1043,9 @@ def test_loc_setitem_empty_append_single_value(self): df.loc[0, "x"] = expected.loc[0, "x"] tm.assert_frame_equal(df, expected) + # TODO(ArrayManager) "split" path doesn't handle this case and gives wrong + # error message + @td.skip_array_manager_not_yet_implemented def test_loc_setitem_empty_append_raises(self): # GH6173, various appends to an empty dataframe @@ -1239,7 +1268,7 @@ def test_loc_setitem_datetimeindex_tz(self, idxer, tz_naive_fixture): result.loc[:, idxer] = expected tm.assert_frame_equal(result, expected) - def test_loc_setitem_time_key(self): + def test_loc_setitem_time_key(self, using_array_manager): index = date_range("2012-01-01", "2012-01-05", freq="30min") df = DataFrame(np.random.randn(len(index), 5), index=index) akey = time(12, 0, 0) @@ -1252,6 +1281,9 @@ def test_loc_setitem_time_key(self): result = result.loc[akey] expected = df.loc[akey].copy() expected.loc[:] = 0 + if using_array_manager: + # TODO(ArrayManager) we are still overwriting columns + expected = expected.astype(float) tm.assert_frame_equal(result, expected) result = df.copy() @@ -1264,6 +1296,9 @@ def test_loc_setitem_time_key(self): result = result.loc[bkey] expected = df.loc[bkey].copy() expected.loc[:] = 0 + if using_array_manager: + # TODO(ArrayManager) we are still overwriting columns + expected = expected.astype(float) tm.assert_frame_equal(result, expected) result = df.copy() @@ -2119,6 +2154,7 @@ def test_loc_setitem_mask_td64_series_value(self): assert expected == result tm.assert_frame_equal(df, df_copy) + @td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values def test_loc_setitem_boolean_and_column(self, float_frame): expected = float_frame.copy() mask = float_frame["A"] > 0 diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 468e4cad742df..b0d41a89931e9 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -7,6 +7,8 @@ import 
numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -146,6 +148,10 @@ def test_partial_setting(self): df.at[dates[-1] + dates.freq, 0] = 7 tm.assert_frame_equal(df, expected) + # TODO(ArrayManager) + # df.loc[0] = Series(1, index=range(4)) case creates float columns + # instead of object dtype + @td.skip_array_manager_not_yet_implemented def test_partial_setting_mixed_dtype(self): # in a mixed dtype environment, try to preserve dtypes