Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Fix __repr__ for categorical dtype #7476

Status: Merged (5 commits, merged on Mar 2, 2021)
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5808,7 +5808,7 @@ def _prepare_for_rowwise_op(self, method, skipna):
is_pure_dt = all(is_datetime_dtype(dt) for dt in self.dtypes)

if not is_pure_dt:
filtered = self.select_dtypes(include=[np.number, np.bool])
filtered = self.select_dtypes(include=[np.number, np.bool_])
else:
filtered = self.copy(deep=False)

Expand Down Expand Up @@ -6587,8 +6587,8 @@ def kurtosis(
msg = "Kurtosis only supports int, float, and bool dtypes."
raise NotImplementedError(msg)

self = self.select_dtypes(include=[np.number, np.bool])
return self._apply_support_method(
filtered = self.select_dtypes(include=[np.number, np.bool_])
return filtered._apply_support_method(
"kurtosis",
axis=axis,
skipna=skipna,
Expand Down Expand Up @@ -6636,8 +6636,8 @@ def skew(
msg = "Skew only supports int, float, and bool dtypes."
raise NotImplementedError(msg)

self = self.select_dtypes(include=[np.number, np.bool])
return self._apply_support_method(
filtered = self.select_dtypes(include=[np.number, np.bool_])
return filtered._apply_support_method(
"skew",
axis=axis,
skipna=skipna,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1195,7 +1195,7 @@ def scatter_by_map(
map_index = as_column(map_index)

# Convert float to integer
if map_index.dtype == np.float:
if map_index.dtype.kind == "f":
map_index = map_index.astype(np.int32)

# Convert string or categorical to integer
Expand Down
9 changes: 8 additions & 1 deletion python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1997,7 +1997,14 @@ def __repr__(self):
output = (
preprocess.astype("str")
.to_pandas()
.astype("category")
.astype(
dtype=pd.CategoricalDtype(
categories=preprocess.dtype.categories.astype(
"str"
).to_pandas(),
ordered=preprocess.dtype.ordered,
)
)
.__repr__()
)
break_idx = output.find("ordered=")
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def _loc_to_iloc(self, arg):

else:
arg = Series(column.as_column(arg))
if arg.dtype in [np.bool, np.bool_]:
if arg.dtype in (bool, np.bool_):
[Review thread on the line above: kkraus14 marked this conversation as resolved.]
return arg
else:
indices = indices_from_labels(self._sr, arg)
Expand Down
5 changes: 3 additions & 2 deletions python/cudf/cudf/core/scalar.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

import numpy as np

from cudf._lib.scalar import DeviceScalar, _is_null_host_scalar
Expand Down Expand Up @@ -251,7 +252,7 @@ def __repr__(self):

def _binop_result_dtype_or_error(self, other, op):
if op in {"__eq__", "__ne__", "__lt__", "__gt__", "__le__", "__ge__"}:
return np.bool
return np.bool_

out_dtype = get_allowed_combinations_for_operator(
self.dtype, other.dtype, op
Expand Down
11 changes: 10 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1072,7 +1072,16 @@ def __repr__(self):
show_dimensions = get_option("display.show_dimensions")
if preprocess._column.categories.dtype.kind == "f":
pd_series = (
preprocess.astype("str").to_pandas().astype("category")
preprocess.astype("str")
.to_pandas()
.astype(
dtype=pd.CategoricalDtype(
categories=preprocess.dtype.categories.astype(
"str"
).to_pandas(),
ordered=preprocess.dtype.ordered,
)
)
)
else:
pd_series = preprocess.to_pandas()
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_applymap.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018, NVIDIA CORPORATION.
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

from itertools import product
from math import floor
Expand All @@ -22,7 +22,7 @@ def test_applymap_round(nelem, masked):
# Make mask
bitmask = utils.random_bitmask(nelem)
boolmask = np.asarray(
utils.expand_bits_to_bytes(bitmask), dtype=np.bool
utils.expand_bits_to_bytes(bitmask), dtype=np.bool_
)[:nelem]
data[~boolmask] = np.nan

Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/tests/test_binops.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,15 +290,15 @@ def test_validity_add(nelem, lhs_nulls, rhs_nulls):
res = lhs + rhs
if lhs_nulls == "some" and rhs_nulls == "some":
res_mask = np.asarray(
utils.expand_bits_to_bytes(lhs_mask & rhs_mask), dtype=np.bool
utils.expand_bits_to_bytes(lhs_mask & rhs_mask), dtype=np.bool_
)[:nelem]
if lhs_nulls == "some" and rhs_nulls == "none":
res_mask = np.asarray(
utils.expand_bits_to_bytes(lhs_mask), dtype=np.bool
utils.expand_bits_to_bytes(lhs_mask), dtype=np.bool_
)[:nelem]
if lhs_nulls == "none" and rhs_nulls == "some":
res_mask = np.asarray(
utils.expand_bits_to_bytes(rhs_mask), dtype=np.bool
utils.expand_bits_to_bytes(rhs_mask), dtype=np.bool_
)[:nelem]
# Fill NA values
na_value = -10000
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1214,7 +1214,7 @@ def test_dataframe_concat_different_numerical_columns(dtype1, dtype2):


def test_dataframe_concat_different_column_types():
df1 = cudf.Series([42], dtype=np.float)
df1 = cudf.Series([42], dtype=np.float64)
df2 = cudf.Series(["a"], dtype="category")
with pytest.raises(ValueError):
cudf.concat([df1, df2])
Expand Down Expand Up @@ -2353,7 +2353,7 @@ def check_frame_series_equality(left, right):

def test_tail_for_string():
gdf = cudf.DataFrame()
gdf["id"] = cudf.Series(["a", "b"], dtype=np.object)
gdf["id"] = cudf.Series(["a", "b"], dtype=np.object_)
gdf["v"] = cudf.Series([1, 2])
assert_eq(gdf.tail(3), gdf.to_pandas().tail(3))

Expand Down
8 changes: 4 additions & 4 deletions python/cudf/cudf/tests/test_joining.py
Original file line number Diff line number Diff line change
Expand Up @@ -821,17 +821,17 @@ def test_join_empty_table_dtype():
"column_a",
[
(
pd.Series([None, 1, 2, 3, 4, 5, 6, 7]).astype(np.float),
pd.Series([8, 9, 10, 11, 12, None, 14, 15]).astype(np.float),
pd.Series([None, 1, 2, 3, 4, 5, 6, 7], dtype=np.float64),
pd.Series([8, 9, 10, 11, 12, None, 14, 15], dtype=np.float64),
)
],
)
@pytest.mark.parametrize(
"column_b",
[
(
pd.Series([0, 1, 0, None, 1, 0, 0, 0]).astype(np.float),
pd.Series([None, 1, 2, 1, 2, 2, 0, 0]).astype(np.float),
pd.Series([0, 1, 0, None, 1, 0, 0, 0], dtype=np.float64),
pd.Series([None, 1, 2, 1, 2, 2, 0, 0], dtype=np.float64),
)
],
)
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/tests/test_pandas_interop.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2018-2020, NVIDIA CORPORATION.
# Copyright (c) 2018-2021, NVIDIA CORPORATION.

import numpy as np
import pandas as pd
Expand All @@ -23,8 +23,8 @@ def test_to_pandas():

# Notice, the dtype differ when Pandas and cudf boolean series
# contains None/NaN
assert df["c"].dtype == np.bool
assert pdf["c"].dtype == np.object
assert df["c"].dtype == np.bool_
assert pdf["c"].dtype == np.object_

assert len(df["a"]) == len(pdf["a"])
assert len(df["b"]) == len(pdf["b"])
Expand Down
24 changes: 22 additions & 2 deletions python/cudf/cudf/tests/test_repr.py
Original file line number Diff line number Diff line change
Expand Up @@ -1433,12 +1433,25 @@ def test_categorical_series_with_nan_repr():
4 NaN
5 <NA>
dtype: category
Categories (4, float64): [1.0, 10.0, 2.0, NaN]
Categories (4, float64): [1.0, 2.0, 10.0, NaN]
"""
)

assert series.__repr__().split() == expected_repr.split()

sliced_expected_repr = textwrap.dedent(
"""
2 NaN
3 10.0
4 NaN
5 <NA>
dtype: category
Categories (4, float64): [1.0, 2.0, 10.0, NaN]
"""
)

assert series[2:].__repr__().split() == sliced_expected_repr.split()


def test_categorical_dataframe_with_nan_repr():
series = cudf.Series(
Expand Down Expand Up @@ -1469,7 +1482,14 @@ def test_categorical_index_with_nan_repr():

expected_repr = (
"CategoricalIndex([1.0, 2.0, NaN, 10.0, NaN, <NA>], "
"categories=[1.0, 10.0, 2.0, NaN], ordered=False, dtype='category')"
"categories=[1.0, 2.0, 10.0, NaN], ordered=False, dtype='category')"
)

assert cat_index.__repr__() == expected_repr

sliced_expected_repr = (
"CategoricalIndex([NaN, 10.0, NaN, <NA>], "
"categories=[1.0, 2.0, 10.0, NaN], ordered=False, dtype='category')"
)

assert cat_index[2:].__repr__() == sliced_expected_repr
6 changes: 4 additions & 2 deletions python/cudf/cudf/tests/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2021, NVIDIA CORPORATION.

import re
from collections.abc import Mapping, Sequence
Expand Down Expand Up @@ -259,7 +259,9 @@ def gen_rand(dtype, size, **kwargs):
elif dtype.kind == "b":
low = kwargs.get("low", 0)
high = kwargs.get("high", 2)
return np.random.randint(low=low, high=high, size=size).astype(np.bool)
return np.random.randint(low=low, high=high, size=size).astype(
np.bool_
)
elif dtype.kind == "M":
low = kwargs.get("low", 0)
time_unit, _ = np.datetime_data(dtype)
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ def wrapper(*args, **kwargs):
return wrapper


def get_null_series(size, dtype=np.bool):
def get_null_series(size, dtype=np.bool_):
"""
Creates a null series of provided dtype and size

Expand Down