Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Support cupy array in quantile input #10429

Merged
merged 16 commits into from
Mar 21, 2022
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,7 +675,11 @@ def append(self, other: ColumnBase) -> ColumnBase:
return concat_columns([self, as_column(other)])

def quantile(
self, q: Union[float, Sequence[float]], interpolation: str, exact: bool
self,
q: np.ndarray,
interpolation: str,
exact: bool,
return_scalar: bool,
) -> ColumnBase:
raise TypeError(f"cannot perform quantile with type {self.dtype}")

Expand Down
14 changes: 10 additions & 4 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import locale
import re
from locale import nl_langinfo
from numbers import Number
from types import SimpleNamespace
from typing import Any, Mapping, Sequence, Union, cast

Expand Down Expand Up @@ -373,12 +372,19 @@ def median(self, skipna: bool = None) -> pd.Timestamp:
)

def quantile(
self, q: Union[float, Sequence[float]], interpolation: str, exact: bool
self,
q: np.ndarray,
interpolation: str,
exact: bool,
return_scalar: bool,
) -> ColumnBase:
result = self.as_numerical.quantile(
q=q, interpolation=interpolation, exact=exact
q=q,
interpolation=interpolation,
exact=exact,
return_scalar=return_scalar,
)
if isinstance(q, Number):
if return_scalar:
return pd.Timestamp(result, unit=self.time_unit)
return result.astype(self.dtype)

Expand Down
43 changes: 28 additions & 15 deletions python/cudf/cudf/core/column/numerical_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@

from __future__ import annotations

from numbers import Number
from typing import Sequence, Union
from typing import cast

import numpy as np

Expand Down Expand Up @@ -92,18 +91,28 @@ def skew(self, skipna: bool = None) -> ScalarLike:
return skew

def quantile(
self, q: Union[float, Sequence[float]], interpolation: str, exact: bool
self,
q: np.ndarray,
interpolation: str,
exact: bool,
return_scalar: bool,
) -> NumericalBaseColumn:
if isinstance(q, Number) or cudf.api.types.is_list_like(q):
np_array_q = np.asarray(q)
if np.logical_or(np_array_q < 0, np_array_q > 1).any():
raise ValueError(
"percentiles should all be in the interval [0, 1]"
)
if np.logical_or(q < 0, q > 1).any():
raise ValueError(
"percentiles should all be in the interval [0, 1]"
)
# Beyond this point, q either being scalar or list-like
# will only have values in range [0, 1]
result = self._numeric_quantile(q, interpolation, exact)
if isinstance(q, Number):
if len(self) == 0:
result = cast(
NumericalBaseColumn,
cudf.core.column.column_empty(
row_count=len(q), dtype=self.dtype, masked=True
),
)
else:
result = self._numeric_quantile(q, interpolation, exact)
if return_scalar:
return (
cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
if result[0] is cudf.NA
Expand Down Expand Up @@ -137,20 +146,24 @@ def median(self, skipna: bool = None) -> NumericalBaseColumn:
return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)

# enforce linear in case the default ever changes
return self.quantile(0.5, interpolation="linear", exact=True)
return self.quantile(
np.array([0.5]),
interpolation="linear",
exact=True,
return_scalar=True,
)

def _numeric_quantile(
self, q: Union[float, Sequence[float]], interpolation: str, exact: bool
self, q: np.ndarray, interpolation: str, exact: bool
) -> NumericalBaseColumn:
quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q
# get sorted indices and exclude nulls
sorted_indices = self.as_frame()._get_sorted_inds(
ascending=True, na_position="first"
)
sorted_indices = sorted_indices[self.null_count :]

return libcudf.quantiles.quantile(
self, quant, interpolation, sorted_indices, exact
self, q, interpolation, sorted_indices, exact
)

def cov(self, other: NumericalBaseColumn) -> float:
Expand Down
16 changes: 11 additions & 5 deletions python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
from __future__ import annotations

import datetime as dt
from numbers import Number
from typing import Any, Sequence, Tuple, Union, cast
from typing import Any, Sequence, Tuple, cast

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -330,12 +329,19 @@ def isin(self, values: Sequence) -> ColumnBase:
return cudf.core.tools.datetimes._isin_datetimelike(self, values)

def quantile(
self, q: Union[float, Sequence[float]], interpolation: str, exact: bool
self,
q: np.ndarray,
interpolation: str,
exact: bool,
return_scalar: bool,
) -> "column.ColumnBase":
result = self.as_numerical.quantile(
q=q, interpolation=interpolation, exact=exact
q=q,
interpolation=interpolation,
exact=exact,
return_scalar=return_scalar,
)
if isinstance(q, Number):
if return_scalar:
return pd.Timedelta(result, unit=self.time_unit)
return result.astype(self.dtype)

Expand Down
34 changes: 21 additions & 13 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import pickle
import warnings
from collections import abc as abc
from numbers import Number
from shutil import get_terminal_size
from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Type, Union

Expand Down Expand Up @@ -2746,21 +2745,30 @@ def quantile(
dtype: float64
"""

result = self._column.quantile(q, interpolation, exact)
return_scalar = is_scalar(q)
if return_scalar:
np_array_q = np.asarray([float(q)])
else:
try:
np_array_q = np.asarray(q)
except TypeError:
try:
np_array_q = cudf.core.column.as_column(q).values_host
except TypeError:
raise TypeError(
f"q must be a scalar or array-like, got {type(q)}"
)

if isinstance(q, Number):
return result
result = self._column.quantile(
np_array_q, interpolation, exact, return_scalar=return_scalar
)

if quant_index:
index = np.asarray(q)
if len(self) == 0:
result = column_empty_like(
index, dtype=self.dtype, masked=True, newsize=len(index),
)
else:
index = None
if return_scalar:
return result

return Series(result, index=index, name=self.name)
return Series(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this use _from_data?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, updated to use `_from_data`.

result, index=np_array_q if quant_index else None, name=self.name
)

@docutils.doc_describe()
@_cudf_nvtx_annotate
Expand Down
34 changes: 33 additions & 1 deletion python/cudf/cudf/tests/test_quantiles.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

import re

import pandas as pd
import pytest

import cudf
from cudf.testing._utils import assert_eq
from cudf.testing._utils import assert_eq, assert_exceptions_equal


def test_single_q():
Expand Down Expand Up @@ -46,3 +51,30 @@ def test_with_multiindex():
gdf_q = gdf.quantiles(q, interpolation="nearest")

assert_eq(pdf_q, gdf_q, check_index_type=False)


@pytest.mark.parametrize("q", [2, [1, 2, 3]])
def test_quantile_range_error(q):
    """Check that out-of-range quantiles fail alike in pandas and cudf.

    Every parametrized ``q`` contains a value greater than 1. The pandas
    and cudf ``Series.quantile`` bound methods are handed to
    ``assert_exceptions_equal`` together with the expected message.
    """
    pandas_series = pd.Series([1, 2, 3])
    cudf_series = cudf.from_pandas(pandas_series)

    # The message contains "[0, 1]", so it is escaped before being passed
    # on as the expected error message.
    message_pattern = re.escape(
        "percentiles should all be in the interval [0, 1]"
    )

    assert_exceptions_equal(
        lfunc=pandas_series.quantile,
        rfunc=cudf_series.quantile,
        lfunc_args_and_kwargs=([q],),
        rfunc_args_and_kwargs=([q],),
        expected_error_message=message_pattern,
    )


def test_quantile_q_type():
    """Check that a DataFrame passed as ``q`` raises a ``TypeError``.

    The expected message names the offending type, so the full text
    (including the class repr) is matched literally.
    """
    series = cudf.Series([1, 2, 3])

    # Escaped so that the punctuation in the class repr is matched
    # verbatim rather than interpreted as a regular expression.
    expected_message = re.escape(
        "q must be a scalar or array-like, got <class "
        "'cudf.core.dataframe.DataFrame'>"
    )

    with pytest.raises(TypeError, match=expected_message):
        series.quantile(cudf.DataFrame())
17 changes: 15 additions & 2 deletions python/cudf/cudf/tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from concurrent.futures import ThreadPoolExecutor

import cupy as cp
import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -201,13 +202,25 @@ def test_approx_quantiles_int():


@pytest.mark.parametrize("data", [[], [1, 2, 3, 10, 326497]])
@pytest.mark.parametrize("q", [[], 0.5, 1, 0.234, [0.345], [0.243, 0.5, 1]])
@pytest.mark.parametrize(
"q",
[
[],
0.5,
1,
0.234,
[0.345],
[0.243, 0.5, 1],
np.array([0.5, 1]),
cp.array([0.5, 1]),
],
)
def test_misc_quantiles(data, q):

pdf_series = cudf.utils.utils._create_pandas_series(data=data)
gdf_series = cudf.Series(data)

expected = pdf_series.quantile(q)
expected = pdf_series.quantile(q.get() if isinstance(q, cp.ndarray) else q)
vyasr marked this conversation as resolved.
Show resolved Hide resolved
actual = gdf_series.quantile(q)
assert_eq(expected, actual)

Expand Down