-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Refactor cudf.Series.__init__ #14450
Changes from 13 commits
9010bbb
94dee27
055d153
70747ce
30d5e65
6f7adde
a444e1d
8254f43
a3753a8
f0c1ecb
eacc27d
d6c460e
6f4f598
3b31a43
7e9fa74
177b198
64b560e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,7 +57,6 @@ | |
TimeDeltaColumn, | ||
arange, | ||
as_column, | ||
column, | ||
full, | ||
) | ||
from cudf.core.column.categorical import ( | ||
|
@@ -202,7 +201,6 @@ def __getitem__(self, arg): | |
|
||
@_cudf_nvtx_annotate | ||
def __setitem__(self, key, value): | ||
from cudf.core.column import column | ||
|
||
if isinstance(key, tuple): | ||
key = list(key) | ||
|
@@ -264,7 +262,7 @@ def __setitem__(self, key, value): | |
self._frame._column.dtype, (cudf.ListDtype, cudf.StructDtype) | ||
) | ||
): | ||
value = column.as_column(value) | ||
value = as_column(value) | ||
|
||
if ( | ||
( | ||
|
@@ -568,7 +566,7 @@ def from_masked_array(cls, data, mask, null_count=None): | |
4 14 | ||
dtype: int64 | ||
""" | ||
col = column.as_column(data).set_mask(mask) | ||
col = as_column(data).set_mask(mask) | ||
return cls(data=col) | ||
|
||
@_cudf_nvtx_annotate | ||
|
@@ -593,73 +591,31 @@ def __init__( | |
"to silence this warning.", | ||
FutureWarning, | ||
) | ||
if isinstance(data, pd.Series): | ||
if name is None: | ||
name = data.name | ||
if isinstance(data.index, pd.MultiIndex): | ||
index = cudf.from_pandas(data.index) | ||
else: | ||
index = as_index(data.index) | ||
elif isinstance(data, pd.Index): | ||
if name is None: | ||
name = data.name | ||
data = as_column(data, nan_as_null=nan_as_null, dtype=dtype) | ||
elif isinstance(data, BaseIndex): | ||
if name is None: | ||
name = data.name | ||
data = data._values | ||
if dtype is not None: | ||
data = data.astype(dtype) | ||
index_from_data = None | ||
name_from_data = None | ||
if isinstance(data, (pd.Series, pd.Index, BaseIndex, Series)): | ||
if copy: | ||
data = data.copy(deep=True) | ||
name_from_data = data.name | ||
column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) | ||
if isinstance(data, (pd.Series, Series)): | ||
index, index_from_data = as_index(data.index), index | ||
elif isinstance(data, ColumnAccessor): | ||
raise TypeError( | ||
"Use cudf.Series._from_data for constructing a Series from " | ||
"ColumnAccessor" | ||
) | ||
|
||
if isinstance(data, Series): | ||
if index is not None: | ||
data = data.reindex(index) | ||
else: | ||
index = data._index | ||
if name is None: | ||
name = data.name | ||
data = data._column | ||
if copy: | ||
data = data.copy(deep=True) | ||
if dtype is not None: | ||
data = data.astype(dtype) | ||
|
||
if isinstance(data, dict): | ||
elif isinstance(data, dict) or data is None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps separating these two conditions as two branches is clearer? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can swap |
||
if not data: | ||
current_index = RangeIndex(0) | ||
else: | ||
current_index = data.keys() | ||
if index is not None: | ||
series = Series( | ||
list(data.values()), | ||
nan_as_null=nan_as_null, | ||
dtype=dtype, | ||
index=current_index, | ||
) | ||
new_index = as_index(index) | ||
if not series.index.equals(new_index): | ||
series = series.reindex(new_index) | ||
data = series._column | ||
index = series._index | ||
data = {} | ||
column = as_column(data, nan_as_null=nan_as_null, dtype=dtype) | ||
index, index_from_data = RangeIndex(0), index | ||
else: | ||
data = column.as_column( | ||
column = as_column( | ||
list(data.values()), nan_as_null=nan_as_null, dtype=dtype | ||
) | ||
index = current_index | ||
if data is None: | ||
if index is not None: | ||
data = column.column_empty( | ||
row_count=len(index), dtype=None, masked=True | ||
) | ||
else: | ||
data = {} | ||
|
||
if not isinstance(data, ColumnBase): | ||
index, index_from_data = as_index(list(data.keys())), index | ||
else: | ||
# Using `getattr_static` to check if | ||
# `data` is on device memory and perform | ||
# a deep copy later. This is different | ||
|
@@ -677,25 +633,34 @@ def __init__( | |
) | ||
is property | ||
) | ||
data = column.as_column( | ||
column = as_column( | ||
data, | ||
nan_as_null=nan_as_null, | ||
dtype=dtype, | ||
length=len(index) if index is not None else None, | ||
) | ||
if copy and has_cai: | ||
data = data.copy(deep=True) | ||
else: | ||
if dtype is not None: | ||
data = data.astype(dtype) | ||
column = column.copy(deep=True) | ||
|
||
if index is not None and not isinstance(index, BaseIndex): | ||
index = as_index(index) | ||
assert isinstance(column, ColumnBase) | ||
|
||
assert isinstance(data, ColumnBase) | ||
if dtype is not None: | ||
column = column.astype(dtype) | ||
Comment on lines
+649
to
+650
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. After passing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is mostly defensive for now as I am not sure if |
||
|
||
super().__init__({name: data}) | ||
self._index = RangeIndex(len(data)) if index is None else index | ||
if index is not None: | ||
index = as_index(index) | ||
else: | ||
index = RangeIndex(len(column)) | ||
|
||
if name_from_data is not None and name is None: | ||
name = name_from_data | ||
super().__init__({name: column}, index=index) | ||
if index_from_data is not None: | ||
# TODO: This there a better way to do this? | ||
index_from_data = as_index(index_from_data) | ||
reindexed = self.reindex(index=index_from_data, copy=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There seems to be a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I just tried this and there were some test failures around data types not being preserved during the |
||
self._data = reindexed._data | ||
self._index = index_from_data | ||
self._check_data_index_length_match() | ||
|
||
@classmethod | ||
|
@@ -717,7 +682,7 @@ def __contains__(self, item): | |
|
||
@classmethod | ||
@_cudf_nvtx_annotate | ||
def from_pandas(cls, s, nan_as_null=no_default): | ||
def from_pandas(cls, s: pd.Series, nan_as_null=no_default): | ||
""" | ||
Convert from a Pandas Series. | ||
|
||
|
@@ -760,7 +725,7 @@ def from_pandas(cls, s, nan_as_null=no_default): | |
False if cudf.get_option("mode.pandas_compatible") else None | ||
) | ||
with warnings.catch_warnings(): | ||
warnings.simplefilter("ignore") | ||
warnings.simplefilter("ignore", FutureWarning) | ||
result = cls(s, nan_as_null=nan_as_null) | ||
return result | ||
|
||
|
@@ -5250,16 +5215,16 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): | |
b = b.reindex(a.index) | ||
index = as_index(a.index) | ||
|
||
a_col = column.as_column(a) | ||
a_col = as_column(a) | ||
a_array = cupy.asarray(a_col.data_array_view(mode="read")) | ||
|
||
b_col = column.as_column(b) | ||
b_col = as_column(b) | ||
b_array = cupy.asarray(b_col.data_array_view(mode="read")) | ||
|
||
result = cupy.isclose( | ||
a=a_array, b=b_array, rtol=rtol, atol=atol, equal_nan=equal_nan | ||
) | ||
result_col = column.as_column(result) | ||
result_col = as_column(result) | ||
|
||
if a_col.null_count and b_col.null_count: | ||
a_nulls = a_col.isnull() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The naming is confusing for me on this line.
index
is the passed in argument,index_from_data
is the index extracted from data. However it seems as if the meaning swapped on this line?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah I can see how this is confusing, I'll swap the naming here throughout the
__init__
to make it clearer