-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Proof of concept for Copy-on-Write implementation #41878
Changes from all commits
462526e
41ee2b7
7a8dffc
1c964be
96b6d71
17cedb9
f4614c2
81b09c2
7f183de
693bc4f
a154591
71370c4
4e785d7
6741340
c4527e9
b33aaf2
9fdad69
f5da03f
c632757
e4a5f33
5efad50
8ea48cc
79b7a30
cf5c7e2
297662a
a011a06
cc09001
37e7ce0
e34b9b2
64377e0
0f28095
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3967,8 +3967,15 @@ def _set_value( | |
""" | ||
try: | ||
if takeable: | ||
series = self._ixs(col, axis=1) | ||
series._set_value(index, value, takeable=True) | ||
if isinstance(self._mgr, ArrayManager): | ||
# with CoW, we can't use intermediate series | ||
# with takeable=True, we know that index is positional and | ||
# not a generic hashable label | ||
index = cast(int, index) | ||
self._mgr.column_setitem(col, index, value) | ||
else: | ||
series = self._ixs(col, axis=1) | ||
series._set_value(index, value, takeable=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If we go down this path, we should make a BlockManager.column_setitem. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would have to look into it, but this might not be that straightforward, since the current path also deals with updating the cache, etc. |
||
return | ||
|
||
series = self._get_item_cache(col) | ||
|
@@ -4900,7 +4907,7 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): | |
"labels", | ||
[ | ||
("method", None), | ||
("copy", True), | ||
("copy", None), | ||
("level", None), | ||
("fill_value", np.nan), | ||
("limit", None), | ||
|
@@ -5084,7 +5091,7 @@ def rename( | |
index: Renamer | None = None, | ||
columns: Renamer | None = None, | ||
axis: Axis | None = None, | ||
copy: bool = True, | ||
copy: bool | None = None, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
inplace: bool = False, | ||
level: Level | None = None, | ||
errors: str = "ignore", | ||
|
@@ -5900,7 +5907,7 @@ class max type | |
if inplace: | ||
new_obj = self | ||
else: | ||
new_obj = self.copy() | ||
new_obj = self.copy(deep=None) | ||
|
||
new_index = default_index(len(new_obj)) | ||
if level is not None: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -996,7 +996,7 @@ def rename( | |
index: Renamer | None = None, | ||
columns: Renamer | None = None, | ||
axis: Axis | None = None, | ||
copy: bool_t = True, | ||
copy: bool_t | None = None, | ||
inplace: bool_t = False, | ||
level: Level | None = None, | ||
errors: str = "ignore", | ||
|
@@ -3952,6 +3952,8 @@ def _check_setitem_copy(self, t="setting", force=False): | |
df.iloc[0:5]['group'] = 'a' | ||
|
||
""" | ||
if isinstance(self._mgr, (ArrayManager, SingleArrayManager)): | ||
return | ||
# return early if the check is not needed | ||
if not (force or self._is_copy): | ||
return | ||
|
@@ -4906,7 +4908,7 @@ def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT: | |
axes, kwargs = self._construct_axes_from_arguments(args, kwargs) | ||
method = missing.clean_reindex_fill_method(kwargs.pop("method", None)) | ||
level = kwargs.pop("level", None) | ||
copy = kwargs.pop("copy", True) | ||
copy = kwargs.pop("copy", None) | ||
limit = kwargs.pop("limit", None) | ||
tolerance = kwargs.pop("tolerance", None) | ||
fill_value = kwargs.pop("fill_value", None) | ||
|
@@ -4931,9 +4933,7 @@ def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT: | |
for axis, ax in axes.items() | ||
if ax is not None | ||
): | ||
if copy: | ||
return self.copy() | ||
return self | ||
return self.copy(deep=copy) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. with copy=False we no longer get a new object. I generally like this, but it's an API change. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, good point. I shouldn't change this in this PR (or at least not for a BlockManager-based DataFrame). Now for the CoW I need to pass down the copy=None, so that will unfortunately require a check for AM vs BM then. |
||
|
||
# check if we are a multi reindex | ||
if self._needs_reindex_multi(axes, method, level): | ||
|
@@ -5895,7 +5895,7 @@ def astype( | |
return cast(NDFrameT, result) | ||
|
||
@final | ||
def copy(self: NDFrameT, deep: bool_t = True) -> NDFrameT: | ||
def copy(self: NDFrameT, deep: bool_t | None = True) -> NDFrameT: | ||
""" | ||
Make a copy of this object's indices and data. | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1840,6 +1840,17 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): | |
""" | ||
pi = plane_indexer | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. #42887 would make for a good precursor |
||
if not hasattr(self.obj._mgr, "blocks"): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Shouldn't this be an ArrayManager test? Why the different semantics? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is just a check to know if it is an ArrayManager, without actually importing it (core/indexing.py currently doesn't import anything from internals). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. yah we should for sure have 1 canonical way of doing this check so we can grep for all the places we do it. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. -> #44676 |
||
# ArrayManager: in this case we cannot rely on getting the column | ||
# as a Series to mutate, but need to operate on the mgr directly | ||
if com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. xref #44353 (doesn't need to be resolved for this PR, but will make lots of things easier) |
||
arr = self.obj._sanitize_column(value) | ||
self.obj._mgr.iset(loc, arr) | ||
else: | ||
self.obj._mgr.column_setitem(loc, plane_indexer, value) | ||
self.obj._clear_item_cache() | ||
return | ||
|
||
ser = self.obj._ixs(loc, axis=1) | ||
|
||
# perform the equivalent of a setitem on the info axis | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you not do this in internals? hate leaking ArrayManager semantics here
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What I added here is actually to call into the internals to do it, instead of the current frame-level methods.
It's only possible for ArrayManager, though (see https://github.com/pandas-dev/pandas/pull/41878/files#r757450838 just below), which is the reason I added a check.