Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Upgrade pandas to 1.2 #7375

Merged
merged 46 commits into branch-0.19 from galipremsagar:7367
Feb 26, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
155f10f
fix issues with updating to latest pandas
galipremsagar Feb 12, 2021
0ec247e
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 12, 2021
454ecf5
remove xfails and fix issues
galipremsagar Feb 12, 2021
a1a928d
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 18, 2021
303c77d
fix isin and misc tests
galipremsagar Feb 22, 2021
18d1fb3
remove redundant code
galipremsagar Feb 22, 2021
b727253
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 22, 2021
01afece
fix more issues
galipremsagar Feb 22, 2021
691d154
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 23, 2021
c7c47b5
fix lots of deprecated warnings
galipremsagar Feb 23, 2021
d106b79
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 23, 2021
aea3313
fix multiple warnings
galipremsagar Feb 23, 2021
9fdbfe7
unpin pandas
galipremsagar Feb 23, 2021
27a782b
cleanup
galipremsagar Feb 23, 2021
3cde2ef
cleanup
galipremsagar Feb 23, 2021
9a3b51a
copyright
galipremsagar Feb 23, 2021
2f8fe18
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 23, 2021
7a534b0
pin pandas upper bound version
galipremsagar Feb 24, 2021
81d9b5d
use only minor version
galipremsagar Feb 24, 2021
14e8c0e
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 24, 2021
c5b83a2
use functools for finding union
galipremsagar Feb 24, 2021
5e6855d
add utility for creating a pandas series and refactor imports in test…
galipremsagar Feb 24, 2021
ea61733
remove is_scalar check
galipremsagar Feb 24, 2021
d8ca966
version all pytest xfails
galipremsagar Feb 24, 2021
8d079f0
add check_order flag
galipremsagar Feb 24, 2021
d8ff534
remove version for cudf apis
galipremsagar Feb 24, 2021
a0637b9
make importing cudf uniform in pytests
galipremsagar Feb 24, 2021
b63ae03
refactor imports to be uniform and less confusing
galipremsagar Feb 24, 2021
c3c3e68
remove versioning of cudf api call
galipremsagar Feb 24, 2021
992b483
Update python/cudf/cudf/tests/test_setitem.py
galipremsagar Feb 24, 2021
355e192
remove double validation
galipremsagar Feb 24, 2021
3942cf1
Merge branch '7367' of https://github.com/galipremsagar/cudf into 7367
galipremsagar Feb 24, 2021
8d06667
move datetime / duration isin logic to a common utility
galipremsagar Feb 24, 2021
032378d
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 25, 2021
dd842f3
add atol
galipremsagar Feb 25, 2021
9fe44cd
rename internal api
galipremsagar Feb 25, 2021
da1a3a3
fix categorical setitem and allow np.nan into categories
galipremsagar Feb 26, 2021
e70686f
add nan setitem test
galipremsagar Feb 26, 2021
39ba07a
make null checks and to_pandas code flow more efficient
galipremsagar Feb 26, 2021
2cc496d
fix repr
galipremsagar Feb 26, 2021
0bd3bba
fix typo
galipremsagar Feb 26, 2021
3d44f5f
fix typo
galipremsagar Feb 26, 2021
c1c2d96
update index code
galipremsagar Feb 26, 2021
19ae2f6
Merge remote-tracking branch 'upstream/branch-0.19' into 7367
galipremsagar Feb 26, 2021
ae1b8c6
add packaging conda install
galipremsagar Feb 26, 2021
416bc92
Merge branch 'branch-0.19' into 7367
galipremsagar Feb 26, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 32 additions & 9 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -946,12 +946,14 @@ def unary_operator(self, unaryop: str):
)

def __setitem__(self, key, value):
to_add_categories = cudf.Index(value).difference(self.categories)
if cudf.utils.dtypes.is_scalar(
value
) and cudf._lib.scalar._is_null_host_scalar(value):
to_add_categories = []
else:
to_add_categories = cudf.Index(value).difference(self.categories)
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved

if (
len(to_add_categories)
and not to_add_categories.isna()._values.all()
):
if len(to_add_categories):
raise ValueError(
"Cannot setitem on a Categorical with a new "
"category, set the categories first"
Expand Down Expand Up @@ -1067,11 +1069,18 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]:
def to_pandas(
self, index: ColumnLike = None, nullable: bool = False, **kwargs
) -> pd.Series:
signed_dtype = min_signed_type(len(self.categories))
codes = self.cat().codes.astype(signed_dtype).fillna(-1).to_array()
categories = self.categories.to_pandas()

if self.categories.isnull().any():
col = self.copy(deep=True)
col[col.isnull()] = None
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved
else:
col = self

signed_dtype = min_signed_type(len(col.categories))
codes = col.cat().codes.astype(signed_dtype).fillna(-1).to_array()
categories = col.categories.dropna(drop_nan=True).to_pandas()
data = pd.Categorical.from_codes(
codes, categories=categories, ordered=self.ordered
codes, categories=categories, ordered=col.ordered
)
return pd.Series(data, index=index)

Expand Down Expand Up @@ -1201,6 +1210,20 @@ def find_and_replace(
ordered=self.dtype.ordered,
)

def isnull(self) -> ColumnBase:
"""Identify missing values in a Column.
"""
result = libcudf.unary.is_null(self)

if self.categories.dtype.kind == "f":
# Need to consider `np.nan` values in case
# of a float column
result = result | libcudf.unary.is_nan(
self.astype(self.categories.dtype)
)
galipremsagar marked this conversation as resolved.
Show resolved Hide resolved

return result

def fillna(
self, fill_value: Any = None, method: Any = None, dtype: Dtype = None
) -> CategoricalColumn:
Expand Down
12 changes: 7 additions & 5 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1063,14 +1063,14 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase:

# columns include null index in factorization; remove:
if self.has_nulls:
cats = cats.dropna()
cats = cats._column.dropna(drop_nan=False)
min_type = min_unsigned_type(len(cats), 8)
labels = labels - 1
if np.dtype(min_type).itemsize < labels.dtype.itemsize:
labels = labels.astype(min_type)

return build_categorical_column(
categories=cats._column,
categories=cats,
codes=labels._column,
mask=self.mask,
ordered=ordered,
Expand Down Expand Up @@ -2077,9 +2077,11 @@ def _construct_array(
arbitrary = cupy.asarray(arbitrary, dtype=dtype)
except (TypeError, ValueError):
native_dtype = dtype
if dtype is None and pd.api.types.infer_dtype(arbitrary) in (
"mixed",
"mixed-integer",
if (
dtype is None
and not cudf._lib.scalar._is_null_host_scalar(arbitrary)
and pd.api.types.infer_dtype(arbitrary)
in ("mixed", "mixed-integer",)
):
native_dtype = "object"
arbitrary = np.asarray(
Expand Down
15 changes: 14 additions & 1 deletion python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1993,7 +1993,20 @@ def __repr__(self):
# utilize `Index.to_string` once it is implemented
# related issue : https://github.com/pandas-dev/pandas/issues/35389
if isinstance(preprocess, CategoricalIndex):
output = preprocess.to_pandas().__repr__()
if preprocess.categories.dtype.kind == "f":
output = (
preprocess.astype("str")
.to_pandas()
.astype("category")
.__repr__()
)
break_idx = output.find("ordered=")
output = (
output[:break_idx].replace("'", "") + output[break_idx:]
)
else:
output = preprocess.to_pandas().__repr__()

output = output.replace("nan", cudf._NA_REP)
elif preprocess._values.nullable:
output = self._clean_nulls_from_index().to_pandas().__repr__()
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,10 @@ def __setitem__(self, key, value):
else:
value = column.as_column(value)

if hasattr(value, "dtype") and pd.api.types.is_numeric_dtype(
value.dtype
if (
not is_categorical_dtype(self._sr._column.dtype)
and hasattr(value, "dtype")
and pd.api.types.is_numeric_dtype(value.dtype)
):
# normalize types if necessary:
if not pd.api.types.is_integer(key):
Expand Down
10 changes: 9 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1070,7 +1070,13 @@ def __repr__(self):
else get_option("display.min_rows")
)
show_dimensions = get_option("display.show_dimensions")
output = preprocess.to_pandas().to_string(
if preprocess._column.categories.dtype.kind == "f":
pd_series = (
preprocess.astype("str").to_pandas().astype("category")
)
else:
pd_series = preprocess.to_pandas()
output = pd_series.to_string(
name=self.name,
dtype=self.dtype,
min_rows=min_rows,
Expand All @@ -1085,6 +1091,8 @@ def __repr__(self):

if isinstance(preprocess._column, cudf.core.column.CategoricalColumn):
category_memory = lines[-1]
if preprocess._column.categories.dtype.kind == "f":
category_memory = category_memory.replace("'", "")
lines = lines[:-1]
if len(lines) > 1:
if lines[-1].startswith("Name: "):
Expand Down
28 changes: 28 additions & 0 deletions python/cudf/cudf/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -762,3 +762,31 @@ def test_categorical_assignment(data, cat_dtype):
pd_df.assign(cat_col=pd_categorical)
cd_df.assign(cat_col=pd_categorical)
assert_eq(pd_df, cd_df)


def test_categorical_allow_nan():
gs = cudf.Series([1, 2, np.nan, 10, np.nan, None], nan_as_null=False)
gs = gs.astype("category")
expected_codes = cudf.Series([0, 1, 3, 2, 3, None], dtype="uint8")
assert_eq(expected_codes, gs.cat.codes)

expected_categories = cudf.Index([1.0, 2.0, 10.0, np.nan], dtype="float64")
assert_eq(expected_categories, gs.cat.categories)

actual_ps = gs.to_pandas()
expected_ps = pd.Series(
[1.0, 2.0, np.nan, 10.0, np.nan, np.nan], dtype="category"
)
assert_eq(actual_ps, expected_ps)


def test_categorical_setitem_with_nan():
gs = cudf.Series(
[1, 2, np.nan, 10, np.nan, None], nan_as_null=False
).astype("category")
gs[[1, 3]] = np.nan

expected_series = cudf.Series(
[1, np.nan, np.nan, np.nan, np.nan, None], nan_as_null=False
).astype(gs.dtype)
assert_eq(gs, expected_series)
56 changes: 56 additions & 0 deletions python/cudf/cudf/tests/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1417,3 +1417,59 @@ def test_mulitIndex_null_repr(gdi, expected_repr):
actual_repr = gdi.__repr__()

assert actual_repr.split() == expected_repr.split()


def test_categorical_series_with_nan_repr():
series = cudf.Series(
[1, 2, np.nan, 10, np.nan, None], nan_as_null=False
).astype("category")

expected_repr = textwrap.dedent(
"""
0 1.0
1 2.0
2 NaN
3 10.0
4 NaN
5 <NA>
dtype: category
Categories (4, object): [1.0, 10.0, 2.0, NaN]
"""
)

assert series.__repr__().split() == expected_repr.split()


def test_categorical_dataframe_with_nan_repr():
series = cudf.Series(
[1, 2, np.nan, 10, np.nan, None], nan_as_null=False
).astype("category")
df = cudf.DataFrame({"a": series})
expected_repr = textwrap.dedent(
"""
a
0 1.0
1 2.0
2 NaN
3 10.0
4 NaN
5 <NA>
"""
)

assert df.__repr__().split() == expected_repr.split()


def test_categorical_index_with_nan_repr():
cat_index = cudf.Index(
cudf.Series(
[1, 2, np.nan, 10, np.nan, None], nan_as_null=False
).astype("category")
)

expected_repr = (
"CategoricalIndex([1.0, 2.0, NaN, 10.0, NaN, <NA>], "
"categories=[1.0, 10.0, 2.0, NaN], ordered=False, dtype='category')"
)

assert cat_index.__repr__() == expected_repr