Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add method argument to DataFrame.quantile #11957

Merged
merged 13 commits into from
Oct 28, 2022
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/cpp/quantiles.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020, NVIDIA CORPORATION.
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
Expand Down Expand Up @@ -27,7 +27,7 @@ cdef extern from "cudf/quantiles.hpp" namespace "cudf" nogil:
bool exact,
) except +

cdef unique_ptr[table] quantiles (
cdef unique_ptr[table] quantile_table (
table_view source_table,
vector[double] q,
interpolation interp,
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/_lib/quantiles.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.quantiles cimport (
quantile as cpp_quantile,
quantiles as cpp_quantiles,
quantiles as cpp_quantile_table,
)
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
Expand Down Expand Up @@ -74,7 +74,7 @@ def quantile(
return Column.from_unique_ptr(move(c_result))


def quantiles(list source_columns,
def quantile_table(list source_columns,
vector[double] q,
object interp,
object is_input_sorted,
Expand Down Expand Up @@ -108,7 +108,7 @@ def quantiles(list source_columns,

with nogil:
c_result = move(
cpp_quantiles(
cpp_quantile_table(
c_input,
c_q,
c_interp,
Expand Down
107 changes: 62 additions & 45 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5212,9 +5212,10 @@ def quantile(
q=0.5,
axis=0,
numeric_only=True,
interpolation="linear",
interpolation=None,
columns=None,
exact=True,
method="single",
):
"""
Return values at the given quantile.
Expand All @@ -5231,11 +5232,16 @@ def quantile(
interpolation : {`linear`, `lower`, `higher`, `midpoint`, `nearest`}
This parameter specifies the interpolation method to use,
when the desired quantile lies between two data points i and j.
Default ``linear``.
Default is ``linear`` for ``method="single"``, and ``nearest``
for ``method="table"``.
columns : list of str
List of column names to include.
exact : boolean
Whether to use approximate or exact quantile algorithm.
method : {`single`, `table`}, default `single`
Whether to compute quantiles per-column ('single') or over all
columns ('table'). When 'table', the only allowed interpolation
methods are 'nearest', 'lower', and 'higher'.

Returns
-------
Expand Down Expand Up @@ -5288,39 +5294,62 @@ def quantile(
if columns is None:
columns = data_df._data.names

# Ensure that qs is non-scalar so that we always get a column back.
qs = [q] if is_scalar(q) else q
result = {}
for k in data_df._data.names:
if k in columns:
ser = data_df[k]
res = ser.quantile(
qs,
interpolation=interpolation,
exact=exact,
quant_index=False,
)._column
if len(res) == 0:
res = column.column_empty_like(
qs, dtype=ser.dtype, masked=True, newsize=len(qs)
)
result[k] = res
if isinstance(q, numbers.Number):
q_is_number = True
qs = [float(q)]
elif pd.api.types.is_list_like(q):
q_is_number = False
qs = q
else:
msg = "`q` must be either a single element or list"
raise TypeError(msg)

result = DataFrame._from_data(result)
if isinstance(q, numbers.Number) and numeric_only:
result = result.fillna(np.nan).iloc[0]
result.index = data_df._data.to_pandas_index()
result.name = q
return result
if method == "table":
interpolation = interpolation or "nearest"
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
result = self._quantile_table(qs, interpolation.upper())

if q_is_number:
result = result.transpose()
return Series(
data=result._columns[0], index=result.index, name=q
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This produces a slight inconsistency with the other cases in that here the index is a cudf.core.index.StringIndex whereas otherwise it is a pandas.core.indexes.base.Index.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this change resolve the inconsistency? Side note that this isn't new code, but I'm certainly happy to fix things while we are making changes anyway.

)
else:
result.index = list(map(float, qs))
return result
# Ensure that qs is non-scalar so that we always get a column back.
interpolation = interpolation or "linear"
result = {}
for k in data_df._data.names:
if k in columns:
ser = data_df[k]
res = ser.quantile(
qs,
interpolation=interpolation,
exact=exact,
quant_index=False,
)._column
if len(res) == 0:
res = column.column_empty_like(
qs, dtype=ser.dtype, masked=True, newsize=len(qs)
)
result[k] = res
result = DataFrame._from_data(result)

if q_is_number and numeric_only:
result = result.fillna(np.nan).iloc[0]
result.index = data_df._data.to_pandas_index()
rjzamora marked this conversation as resolved.
Show resolved Hide resolved
result.name = q
return result

result.index = list(map(float, qs))
return result

@_cudf_nvtx_annotate
def quantiles(self, q=0.5, interpolation="nearest"):
"""
Return values at the given quantile.

This API is now deprecated. Please use ``DataFrame.quantile``
with ``method='table'``.

Parameters
----------
q : float or array-like
Expand All @@ -5334,25 +5363,13 @@ def quantiles(self, q=0.5, interpolation="nearest"):
-------
DataFrame
"""
if isinstance(q, numbers.Number):
q_is_number = True
q = [float(q)]
elif pd.api.types.is_list_like(q):
q_is_number = False
else:
msg = "`q` must be either a single element or list"
raise TypeError(msg)

result = self._quantiles(q, interpolation.upper())
warnings.warn(
"DataFrame.quantiles is now deprecated. "
bdice marked this conversation as resolved.
Show resolved Hide resolved
"Please use DataFrame.quantile with `method='table'`.",
FutureWarning,
)

if q_is_number:
result = result.transpose()
return Series(
data=result._columns[0], index=result.index, name=q[0]
)
else:
result.index = as_index(q)
return result
return self.quantile(q=q, interpolation=interpolation, method="table")

@_cudf_nvtx_annotate
def isin(self, values):
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -953,7 +953,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None):
return self[out_cols]

@_cudf_nvtx_annotate
def _quantiles(
def _quantile_table(
self,
q,
interpolation="LINEAR",
Expand All @@ -972,7 +972,7 @@ def _quantiles(
]

return self._from_columns_like_self(
libcudf.quantiles.quantiles(
libcudf.quantiles.quantile_table(
[*self._columns],
q,
interpolation,
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/tests/test_quantiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def test_single_q():
gdf = cudf.from_pandas(pdf)

pdf_q = pdf.quantile(q, interpolation="nearest")
gdf_q = gdf.quantiles(q, interpolation="nearest")
gdf_q = gdf.quantile(q, interpolation="nearest", method="table")

assert_eq(pdf_q, gdf_q, check_index_type=False)

Expand All @@ -28,7 +28,7 @@ def test_with_index():
gdf = cudf.from_pandas(pdf)

pdf_q = pdf.quantile(q, interpolation="nearest")
gdf_q = gdf.quantiles(q, interpolation="nearest")
gdf_q = gdf.quantile(q, interpolation="nearest", method="table")

assert_eq(pdf_q, gdf_q, check_index_type=False)

Expand All @@ -48,7 +48,7 @@ def test_with_multiindex():
gdf = cudf.from_pandas(pdf)

pdf_q = pdf.quantile(q, interpolation="nearest")
gdf_q = gdf.quantiles(q, interpolation="nearest")
gdf_q = gdf.quantile(q, interpolation="nearest", method="table")

assert_eq(pdf_q, gdf_q, check_index_type=False)

Expand Down
4 changes: 2 additions & 2 deletions python/dask_cudf/dask_cudf/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def _quantile(a, q):
n = len(a)
if not len(a):
return None, n
return (a.quantiles(q=q.tolist(), interpolation="nearest"), n)
return (a.quantile(q=q.tolist(), interpolation="nearest", method="table"), n)


@_dask_cudf_nvtx_annotate
Expand Down Expand Up @@ -133,7 +133,7 @@ def _approximate_quantile(df, q):
final_type = df._meta._constructor

# Create metadata
meta = df._meta_nonempty.quantiles(q=q)
meta = df._meta_nonempty.quantile(q=q, method="table")

# Define final action (create df with quantiles as index)
def finalize_tsk(tsk):
Expand Down