Skip to content

Commit

Permalink
Add method argument to DataFrame.quantile (#11957)
Browse files Browse the repository at this point in the history
Adds a `method` argument to `DataFrame.quantile` to match pandas behavior. Also deprecates `DataFrame.quantiles` (with a `FutureWarning` informing the user of the `method` argument).

Closes #11572

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Bradley Dice (https://github.com/bdice)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)

URL: #11957
  • Loading branch information
rjzamora authored Oct 28, 2022
1 parent aaf251d commit 7620fb1
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 61 deletions.
20 changes: 11 additions & 9 deletions python/cudf/cudf/_lib/quantiles.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.quantiles cimport (
quantile as cpp_quantile,
quantiles as cpp_quantiles,
quantiles as cpp_quantile_table,
)
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
Expand Down Expand Up @@ -74,12 +74,14 @@ def quantile(
return Column.from_unique_ptr(move(c_result))


def quantiles(list source_columns,
vector[double] q,
object interp,
object is_input_sorted,
list column_order,
list null_precedence):
def quantile_table(
list source_columns,
vector[double] q,
object interp,
object is_input_sorted,
list column_order,
list null_precedence,
):
cdef table_view c_input = table_view_from_columns(source_columns)
cdef vector[double] c_q = q
cdef interpolation c_interp = <interpolation>(
Expand Down Expand Up @@ -108,13 +110,13 @@ def quantiles(list source_columns,

with nogil:
c_result = move(
cpp_quantiles(
cpp_quantile_table(
c_input,
c_q,
c_interp,
c_is_input_sorted,
c_column_order,
c_null_precedence
c_null_precedence,
)
)

Expand Down
107 changes: 62 additions & 45 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5195,9 +5195,10 @@ def quantile(
q=0.5,
axis=0,
numeric_only=True,
interpolation="linear",
interpolation=None,
columns=None,
exact=True,
method="single",
):
"""
Return values at the given quantile.
Expand All @@ -5214,11 +5215,16 @@ def quantile(
interpolation : {`linear`, `lower`, `higher`, `midpoint`, `nearest`}
This parameter specifies the interpolation method to use,
when the desired quantile lies between two data points i and j.
Default ``linear``.
Default is ``linear`` for ``method="single"``, and ``nearest``
for ``method="table"``.
columns : list of str
List of column names to include.
exact : boolean
Whether to use approximate or exact quantile algorithm.
method : {`single`, `table`}, default `single`
Whether to compute quantiles per-column ('single') or over all
columns ('table'). When 'table', the only allowed interpolation
methods are 'nearest', 'lower', and 'higher'.
Returns
-------
Expand Down Expand Up @@ -5271,39 +5277,62 @@ def quantile(
if columns is None:
columns = data_df._data.names

# Ensure that qs is non-scalar so that we always get a column back.
qs = [q] if is_scalar(q) else q
result = {}
for k in data_df._data.names:
if k in columns:
ser = data_df[k]
res = ser.quantile(
qs,
interpolation=interpolation,
exact=exact,
quant_index=False,
)._column
if len(res) == 0:
res = column.column_empty_like(
qs, dtype=ser.dtype, masked=True, newsize=len(qs)
)
result[k] = res
if isinstance(q, numbers.Number):
q_is_number = True
qs = [float(q)]
elif pd.api.types.is_list_like(q):
q_is_number = False
qs = q
else:
msg = "`q` must be either a single element or list"
raise TypeError(msg)

result = DataFrame._from_data(result)
if isinstance(q, numbers.Number) and numeric_only:
result = result.fillna(np.nan).iloc[0]
result.index = data_df._data.to_pandas_index()
result.name = q
return result
if method == "table":
interpolation = interpolation or "nearest"
result = self._quantile_table(qs, interpolation.upper())

if q_is_number:
result = result.transpose()
return Series(
data=result._columns[0], index=result.index, name=q
)
else:
result.index = list(map(float, qs))
return result
# Ensure that qs is non-scalar so that we always get a column back.
interpolation = interpolation or "linear"
result = {}
for k in data_df._data.names:
if k in columns:
ser = data_df[k]
res = ser.quantile(
qs,
interpolation=interpolation,
exact=exact,
quant_index=False,
)._column
if len(res) == 0:
res = column.column_empty_like(
qs, dtype=ser.dtype, masked=True, newsize=len(qs)
)
result[k] = res
result = DataFrame._from_data(result)

if q_is_number and numeric_only:
result = result.fillna(np.nan).iloc[0]
result.index = data_df.keys()
result.name = q
return result

result.index = list(map(float, qs))
return result

@_cudf_nvtx_annotate
def quantiles(self, q=0.5, interpolation="nearest"):
"""
Return values at the given quantile.
This API is now deprecated. Please use ``DataFrame.quantile``
with ``method='table'``.
Parameters
----------
q : float or array-like
Expand All @@ -5317,25 +5346,13 @@ def quantiles(self, q=0.5, interpolation="nearest"):
-------
DataFrame
"""
if isinstance(q, numbers.Number):
q_is_number = True
q = [float(q)]
elif pd.api.types.is_list_like(q):
q_is_number = False
else:
msg = "`q` must be either a single element or list"
raise TypeError(msg)

result = self._quantiles(q, interpolation.upper())
warnings.warn(
"DataFrame.quantiles is now deprecated. "
"Please use DataFrame.quantile with `method='table'`.",
FutureWarning,
)

if q_is_number:
result = result.transpose()
return Series(
data=result._columns[0], index=result.index, name=q[0]
)
else:
result.index = as_index(q)
return result
return self.quantile(q=q, interpolation=interpolation, method="table")

@_cudf_nvtx_annotate
def isin(self, values):
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -953,7 +953,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None):
return self[out_cols]

@_cudf_nvtx_annotate
def _quantiles(
def _quantile_table(
self,
q,
interpolation="LINEAR",
Expand All @@ -972,7 +972,7 @@ def _quantiles(
]

return self._from_columns_like_self(
libcudf.quantiles.quantiles(
libcudf.quantiles.quantile_table(
[*self._columns],
q,
interpolation,
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/tests/test_quantiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def test_single_q():
gdf = cudf.from_pandas(pdf)

pdf_q = pdf.quantile(q, interpolation="nearest")
gdf_q = gdf.quantiles(q, interpolation="nearest")
gdf_q = gdf.quantile(q, interpolation="nearest", method="table")

assert_eq(pdf_q, gdf_q, check_index_type=False)

Expand All @@ -28,7 +28,7 @@ def test_with_index():
gdf = cudf.from_pandas(pdf)

pdf_q = pdf.quantile(q, interpolation="nearest")
gdf_q = gdf.quantiles(q, interpolation="nearest")
gdf_q = gdf.quantile(q, interpolation="nearest", method="table")

assert_eq(pdf_q, gdf_q, check_index_type=False)

Expand All @@ -48,7 +48,7 @@ def test_with_multiindex():
gdf = cudf.from_pandas(pdf)

pdf_q = pdf.quantile(q, interpolation="nearest")
gdf_q = gdf.quantiles(q, interpolation="nearest")
gdf_q = gdf.quantile(q, interpolation="nearest", method="table")

assert_eq(pdf_q, gdf_q, check_index_type=False)

Expand Down
7 changes: 5 additions & 2 deletions python/dask_cudf/dask_cudf/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,10 @@ def _quantile(a, q):
n = len(a)
if not len(a):
return None, n
return (a.quantiles(q=q.tolist(), interpolation="nearest"), n)
return (
a.quantile(q=q.tolist(), interpolation="nearest", method="table"),
n,
)


@_dask_cudf_nvtx_annotate
Expand Down Expand Up @@ -133,7 +136,7 @@ def _approximate_quantile(df, q):
final_type = df._meta._constructor

# Create metadata
meta = df._meta_nonempty.quantiles(q=q)
meta = df._meta_nonempty.quantile(q=q, method="table")

# Define final action (create df with quantiles as index)
def finalize_tsk(tsk):
Expand Down

0 comments on commit 7620fb1

Please sign in to comment.