diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx index f65c29a55a8..6cf2d22f978 100644 --- a/python/cudf/cudf/_lib/quantiles.pyx +++ b/python/cudf/cudf/_lib/quantiles.pyx @@ -20,7 +20,7 @@ from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.quantiles cimport ( quantile as cpp_quantile, - quantiles as cpp_quantiles, + quantiles as cpp_quantile_table, ) from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -74,12 +74,14 @@ def quantile( return Column.from_unique_ptr(move(c_result)) -def quantiles(list source_columns, - vector[double] q, - object interp, - object is_input_sorted, - list column_order, - list null_precedence): +def quantile_table( + list source_columns, + vector[double] q, + object interp, + object is_input_sorted, + list column_order, + list null_precedence, +): cdef table_view c_input = table_view_from_columns(source_columns) cdef vector[double] c_q = q cdef interpolation c_interp = ( @@ -108,13 +110,13 @@ def quantiles(list source_columns, with nogil: c_result = move( - cpp_quantiles( + cpp_quantile_table( c_input, c_q, c_interp, c_is_input_sorted, c_column_order, - c_null_precedence + c_null_precedence, ) ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 02c5542a88a..92ca5148c1e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5195,9 +5195,10 @@ def quantile( q=0.5, axis=0, numeric_only=True, - interpolation="linear", + interpolation=None, columns=None, exact=True, + method="single", ): """ Return values at the given quantile. @@ -5214,11 +5215,16 @@ def quantile( interpolation : {`linear`, `lower`, `higher`, `midpoint`, `nearest`} This parameter specifies the interpolation method to use, when the desired quantile lies between two data points i and j. - Default ``linear``. + Default is ``linear`` for ``method="single"``, and ``nearest`` + for ``method="table"``. columns : list of str List of column names to include. exact : boolean Whether to use approximate or exact quantile algorithm. + method : {`single`, `table`}, default `single` + Whether to compute quantiles per-column ('single') or over all + columns ('table'). When 'table', the only allowed interpolation + methods are 'nearest', 'lower', and 'higher'. Returns ------- @@ -5271,39 +5277,62 @@ def quantile( if columns is None: columns = data_df._data.names - # Ensure that qs is non-scalar so that we always get a column back. - qs = [q] if is_scalar(q) else q - result = {} - for k in data_df._data.names: - if k in columns: - ser = data_df[k] - res = ser.quantile( - qs, - interpolation=interpolation, - exact=exact, - quant_index=False, - )._column - if len(res) == 0: - res = column.column_empty_like( - qs, dtype=ser.dtype, masked=True, newsize=len(qs) - ) - result[k] = res + if isinstance(q, numbers.Number): + q_is_number = True + qs = [float(q)] + elif pd.api.types.is_list_like(q): + q_is_number = False + qs = q + else: + msg = "`q` must be either a single element or list" + raise TypeError(msg) - result = DataFrame._from_data(result) - if isinstance(q, numbers.Number) and numeric_only: - result = result.fillna(np.nan).iloc[0] - result.index = data_df._data.to_pandas_index() - result.name = q - return result + if method == "table": + interpolation = interpolation or "nearest" + result = self._quantile_table(qs, interpolation.upper()) + + if q_is_number: + result = result.transpose() + return Series( + data=result._columns[0], index=result.index, name=q + ) else: - result.index = list(map(float, qs)) - return result + # Ensure that qs is non-scalar so that we always get a column back. + interpolation = interpolation or "linear" + result = {} + for k in data_df._data.names: + if k in columns: + ser = data_df[k] + res = ser.quantile( + qs, + interpolation=interpolation, + exact=exact, + quant_index=False, + )._column + if len(res) == 0: + res = column.column_empty_like( + qs, dtype=ser.dtype, masked=True, newsize=len(qs) + ) + result[k] = res + result = DataFrame._from_data(result) + + if q_is_number and numeric_only: + result = result.fillna(np.nan).iloc[0] + result.index = data_df.keys() + result.name = q + return result + + result.index = list(map(float, qs)) + return result @_cudf_nvtx_annotate def quantiles(self, q=0.5, interpolation="nearest"): """ Return values at the given quantile. + This API is now deprecated. Please use ``DataFrame.quantile`` + with ``method='table'``. + Parameters ---------- q : float or array-like @@ -5317,25 +5346,13 @@ def quantiles(self, q=0.5, interpolation="nearest"): ------- DataFrame """ - if isinstance(q, numbers.Number): - q_is_number = True - q = [float(q)] - elif pd.api.types.is_list_like(q): - q_is_number = False - else: - msg = "`q` must be either a single element or list" - raise TypeError(msg) - - result = self._quantiles(q, interpolation.upper()) + warnings.warn( + "DataFrame.quantiles is now deprecated. " + "Please use DataFrame.quantile with `method='table'`.", + FutureWarning, + ) - if q_is_number: - result = result.transpose() - return Series( - data=result._columns[0], index=result.index, name=q[0] - ) - else: - result.index = as_index(q) - return result + return self.quantile(q=q, interpolation=interpolation, method="table") @_cudf_nvtx_annotate def isin(self, values): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9e539ee157b..12c53ae258d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -953,7 +953,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): return self[out_cols] @_cudf_nvtx_annotate - def _quantiles( + def _quantile_table( self, q, interpolation="LINEAR", @@ -972,7 +972,7 @@ def _quantiles( ] return self._from_columns_like_self( - libcudf.quantiles.quantiles( + libcudf.quantiles.quantile_table( [*self._columns], q, interpolation, diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py index c4e3f690d13..72b36d8a1a6 100644 --- a/python/cudf/cudf/tests/test_quantiles.py +++ b/python/cudf/cudf/tests/test_quantiles.py @@ -16,7 +16,7 @@ def test_single_q(): gdf = cudf.from_pandas(pdf) pdf_q = pdf.quantile(q, interpolation="nearest") - gdf_q = gdf.quantiles(q, interpolation="nearest") + gdf_q = gdf.quantile(q, interpolation="nearest", method="table") assert_eq(pdf_q, gdf_q, check_index_type=False) @@ -28,7 +28,7 @@ def test_with_index(): gdf = cudf.from_pandas(pdf) pdf_q = pdf.quantile(q, interpolation="nearest") - gdf_q = gdf.quantiles(q, interpolation="nearest") + gdf_q = gdf.quantile(q, interpolation="nearest", method="table") assert_eq(pdf_q, gdf_q, check_index_type=False) @@ -48,7 +48,7 @@ def test_with_multiindex(): gdf = cudf.from_pandas(pdf) pdf_q = pdf.quantile(q, interpolation="nearest") - gdf_q = gdf.quantiles(q, interpolation="nearest") + gdf_q = gdf.quantile(q, interpolation="nearest", method="table") assert_eq(pdf_q, gdf_q, check_index_type=False) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index 4c2372393e5..0f2dc0d4efc 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -48,7 +48,10 @@ def _quantile(a, q): n = len(a) if not len(a): return None, n - return (a.quantiles(q=q.tolist(), interpolation="nearest"), n) + return ( + a.quantile(q=q.tolist(), interpolation="nearest", method="table"), + n, + ) @_dask_cudf_nvtx_annotate @@ -133,7 +136,7 @@ def _approximate_quantile(df, q): final_type = df._meta._constructor # Create metadata - meta = df._meta_nonempty.quantiles(q=q) + meta = df._meta_nonempty.quantile(q=q, method="table") # Define final action (create df with quantiles as index) def finalize_tsk(tsk):