From 07aa58135852a84f8cfe8c71b1ce158c8d249a14 Mon Sep 17 00:00:00 2001 From: Touutae-lab Date: Fri, 3 Feb 2023 22:33:21 +0700 Subject: [PATCH] Convert note --- python/cudf/cudf/core/column/lists.py | 10 +- python/cudf/cudf/core/column/string.py | 98 ++++++------ python/cudf/cudf/core/dataframe.py | 189 ++++++++++++----------- python/cudf/cudf/core/frame.py | 118 +++++++------- python/cudf/cudf/core/groupby/groupby.py | 19 +-- python/cudf/cudf/core/indexed_frame.py | 93 +++++------ python/cudf/cudf/core/series.py | 62 ++++---- python/cudf/cudf/core/tools/datetimes.py | 3 +- python/cudf/cudf/core/tools/numeric.py | 18 ++- python/cudf/cudf/tests/test_dataframe.py | 53 ++++--- python/dask_cudf/dask_cudf/accessors.py | 12 +- python/dask_cudf/dask_cudf/io/parquet.py | 5 +- 12 files changed, 369 insertions(+), 311 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 05ff4acde09..5e290e21a88 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -642,11 +642,6 @@ def sort_values( ------- Series or Index with each list sorted - Notes - ----- - Difference from pandas: - * Not supporting: `inplace`, `kind` - Examples -------- >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]]) @@ -655,6 +650,11 @@ def sort_values( 1 [2.0, 8.0, 8.0] 2 [1.0, 2.0] dtype: list + + .. pandas-compat:: + **list.ListMethods.sort_values** + + The ``inplace`` and ``kind`` arguments are currently not supported. """ if inplace: raise NotImplementedError("`inplace` not currently implemented.") diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9c30585a541..79082779e0b 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -596,11 +596,6 @@ def extract( for each group. If `expand=False` and `pat` has only one capture group, then return a Series/Index. 
- Notes - ----- - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - Examples -------- >>> import cudf @@ -627,6 +622,12 @@ def extract( 1 2 2 dtype: object + + .. pandas-compat:: + **StringMethods.extract** + + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. """ # noqa W605 if not _is_supported_regex_flags(flags): raise NotImplementedError( @@ -674,14 +675,6 @@ def contains( pattern is contained within the string of each element of the Series/Index. - Notes - ----- - The parameters `case` and `na` are not yet supported and will - raise a NotImplementedError if anything other than the default - value is set. - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - Examples -------- >>> import cudf @@ -755,6 +748,15 @@ def contains( 3 True 4 dtype: bool + + .. pandas-compat:: + **StringMethods.contains** + + The parameters `case` and `na` are not yet supported and will + raise a NotImplementedError if anything other than the default + value is set. + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. """ # noqa W605 if case is not True: raise NotImplementedError("`case` parameter is not yet supported") @@ -946,12 +948,6 @@ def replace( A copy of the object with all matching occurrences of pat replaced by repl. - Notes - ----- - The parameters `case` and `flags` are not yet supported and will raise - a `NotImplementedError` if anything other than the default value - is set. - Examples -------- >>> import cudf @@ -981,6 +977,13 @@ def replace( 1 fuz 2 dtype: object + + .. pandas-compat:: + **StringMethods.replace** + + The parameters `case` and `flags` are not yet supported and will raise + a `NotImplementedError` if anything other than the default value + is set. 
""" if case is not None: raise NotImplementedError("`case` parameter is not yet supported") @@ -2767,11 +2770,6 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: DataFrame or MultiIndex Returns a DataFrame / MultiIndex - Notes - ----- - The parameter `expand` is not yet supported and will raise a - `NotImplementedError` if anything other than the default value is set. - See Also -------- rpartition @@ -2813,6 +2811,12 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: MultiIndex([('X', ' ', '123'), ('Y', ' ', '999')], ) + + .. pandas-compat:: + **StringMethods.partition** + + The parameter `expand` is not yet supported and will raise a + `NotImplementedError` if anything other than the default value is set. """ if expand is not True: raise NotImplementedError( @@ -3500,11 +3504,11 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex: Notes ----- - - `flags` parameter currently only supports re.DOTALL - and re.MULTILINE. - - Some characters need to be escaped when passing - in pat. e.g. ``'$'`` has a special meaning in regex - and must be escaped when finding this literal character. + - `flags` parameter currently only supports re.DOTALL + and re.MULTILINE. + - Some characters need to be escaped when passing + in pat. e.g. ``'$'`` has a special meaning in regex + and must be escaped when finding this literal character. Examples -------- @@ -3568,11 +3572,6 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: All non-overlapping matches of pattern or regular expression in each string of this Series/Index. - Notes - ----- - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - Examples -------- >>> import cudf @@ -3613,6 +3612,12 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex: 1 [] 2 [b, b] dtype: list + + .. pandas-compat:: + **StringMethods.findall** + + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. 
""" if isinstance(pat, re.Pattern): flags = pat.flags & ~re.U @@ -3795,11 +3800,6 @@ def endswith(self, pat: str) -> SeriesOrIndex: A Series of booleans indicating whether the given pattern matches the end of each string element. - Notes - ----- - `na` parameter is not yet supported, as cudf uses - native strings instead of Python objects. - Examples -------- >>> import cudf @@ -3816,6 +3816,12 @@ def endswith(self, pat: str) -> SeriesOrIndex: 2 False 3 dtype: bool + + .. pandas-compat:: + **StringMethods.endswith** + + `na` parameter is not yet supported, as cudf uses + native strings instead of Python objects. """ if pat is None: raise TypeError( @@ -4241,13 +4247,6 @@ def match( ------- Series or Index of boolean values. - Notes - ----- - Parameters `case` and `na` are currently not supported. - The `flags` parameter currently only supports re.DOTALL and - re.MULTILINE. - - Examples -------- >>> import cudf @@ -4268,6 +4267,13 @@ def match( 1 True 2 True dtype: bool + + .. pandas-compat:: + **StringMethods.match** + + Parameters `case` and `na` are currently not supported. + The `flags` parameter currently only supports re.DOTALL and + re.MULTILINE. """ if case is not True: raise NotImplementedError("`case` parameter is not yet supported") diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 12fdd07984a..ab795f3a833 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3118,10 +3118,6 @@ def diff(self, periods=1, axis=0): DataFrame First differences of the DataFrame. - Notes - ----- - Diff currently only supports numeric dtype columns. - Examples -------- >>> import cudf @@ -3145,6 +3141,10 @@ def diff(self, periods=1, axis=0): 4 2 3 16 5 2 5 20 + .. pandas-compat:: + **DataFrame.diff** + + Diff currently only supports numeric dtype columns. 
""" if not is_integer(periods): if not (is_float(periods) and periods.is_integer()): @@ -3310,14 +3310,6 @@ def rename( ------- DataFrame - Notes - ----- - Difference from pandas: - * Not supporting: level - - Rename will not overwrite column names. If a list with duplicates is - passed, column names will be postfixed with a number. - Examples -------- >>> import cudf @@ -3343,6 +3335,14 @@ def rename( 10 1 4 20 2 5 30 3 6 + + .. pandas-compat:: + **DataFrame.rename** + + * Not Supporting: level + + Rename will not overwrite column names. If a list with duplicates is + passed, column names will be postfixed with a number. """ if errors != "ignore": raise NotImplementedError( @@ -3442,10 +3442,10 @@ def agg(self, aggs, axis=None): When ``DataFrame.agg`` is called with several aggs, ``DataFrame`` is returned. - Notes - ----- - Difference from pandas: - * Not supporting: ``axis``, ``*args``, ``**kwargs`` + .. pandas-compat:: + **DataFrame.agg** + + * Not supporting: ``axis``, ``*args``, ``**kwargs`` """ # TODO: Remove the typecasting below once issue #6846 is fixed @@ -3578,11 +3578,6 @@ def nlargest(self, n, columns, keep="first"): The first `n` rows ordered by the given columns in descending order. - Notes - ----- - Difference from pandas: - - Only a single column is supported in *columns* - Examples -------- >>> import cudf @@ -3617,6 +3612,11 @@ def nlargest(self, n, columns, keep="first"): France 65000000 2583560 FR Italy 59000000 1937894 IT Brunei 434000 12128 BN + + .. 
pandas-compat:: + **DataFrame.nlargest** + + - Only a single column is supported in *columns* """ return self._n_largest_or_smallest(True, n, columns, keep) @@ -3643,11 +3643,6 @@ def nsmallest(self, n, columns, keep="first"): ------- DataFrame - Notes - ----- - Difference from pandas: - - Only a single column is supported in *columns* - Examples -------- >>> import cudf @@ -3689,6 +3684,11 @@ def nsmallest(self, n, columns, keep="first"): Anguilla 11300 311 AI Tuvalu 11300 38 TV Nauru 337000 182 NR + + .. pandas-compat:: + **DataFrame.nsmallest** + + - Only a single column is supported in *columns* """ return self._n_largest_or_smallest(False, n, columns, keep) @@ -3766,10 +3766,10 @@ def transpose(self): ------- a new (ncol x nrow) dataframe. self is (nrow x ncol) - Notes - ----- - Difference from pandas: - Not supporting *copy* because default and only behavior is copy=True + .. pandas-compat:: + **DataFrame.transpose, DataFrame.T** + + Not supporting *copy* because default and only behavior is copy=True """ index = self._data.to_pandas_index() @@ -3921,10 +3921,6 @@ def merge( ------- merged : DataFrame - Notes - ----- - **DataFrames merges in cuDF result in non-deterministic row ordering.** - Examples -------- >>> import cudf @@ -3960,6 +3956,11 @@ def merge( right dtype respectively. This extends to semi and anti joins. - For outer joins, the result will be the union of categories from both sides. + + .. pandas-compat:: + **DataFrame.merge** + + **DataFrames merges in cuDF result in non-deterministic row ordering.** """ if indicator: raise NotImplementedError( @@ -4030,12 +4031,11 @@ def join( ------- joined : DataFrame - Notes - ----- - Difference from pandas: + .. pandas-compat:: + **DataFrame.join** - - *other* must be a single DataFrame for now. - - *on* is not supported yet due to lack of multi-index support. + - *other* must be a single DataFrame for now. + - *on* is not supported yet due to lack of multi-index support. 
""" if on is not None: raise NotImplementedError("The on parameter is not yet supported") @@ -5147,11 +5147,6 @@ def from_arrow(cls, table): ------- cudf DataFrame - Notes - ----- - - Does not support automatically setting index column(s) similar - to how ``to_pandas`` works for PyArrow Tables. - Examples -------- >>> import cudf @@ -5162,6 +5157,12 @@ def from_arrow(cls, table): 0 1 4 1 2 5 2 3 6 + + .. pandas-compat:: + **DataFrame.from_arrow** + + - Does not support automatically setting index column(s) similar + to how ``to_pandas`` works for PyArrow Tables. """ index_col = None if isinstance(table, pa.Table) and isinstance( @@ -5173,13 +5174,21 @@ def from_arrow(cls, table): if index_col: if isinstance(index_col[0], dict): - out = out.set_index( - cudf.RangeIndex( - index_col[0]["start"], - index_col[0]["stop"], - name=index_col[0]["name"], - ) + idx = cudf.RangeIndex( + index_col[0]["start"], + index_col[0]["stop"], + name=index_col[0]["name"], ) + if len(idx) == len(out): + # `idx` is generated from arrow `pandas_metadata` + # which can get out of date with many of the + # arrow operations. Hence verifying if the + # lengths match, or else don't need to set + # an index at all i.e., Default RangeIndex + # will be set. + # See more about the discussion here: + # https://github.com/apache/arrow/issues/15178 + out = out.set_index(idx) else: out = out.set_index(index_col[0]) @@ -5493,14 +5502,6 @@ def quantile( If q is a float, a Series will be returned where the index is the columns of self and the values are the quantiles. - .. pandas-compat:: - **DataFrame.quantile** - - One notable difference from Pandas is when DataFrame is of - non-numeric types and result is expected to be a Series in case of - Pandas. cuDF will return a DataFrame as it doesn't support mixed - types under Series. - Examples -------- >>> import cupy as cp @@ -5521,6 +5522,14 @@ def quantile( a b 0.1 1.3 3.7 0.5 2.5 55.0 + + .. 
pandas-compat:: + **DataFrame.quantile** + + One notable difference from Pandas is when DataFrame is of + non-numeric types and result is expected to be a Series in case of + Pandas. cuDF will return a DataFrame as it doesn't support mixed + types under Series. """ # noqa: E501 if axis not in (0, None): raise NotImplementedError("axis is not implemented yet") @@ -5786,10 +5795,6 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): Series For each column/row the number of non-NA/null entries. - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -5803,6 +5808,11 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): Age 4 Single 5 dtype: int64 + + .. pandas-compat:: + **DataFrame.count** + + Parameters currently not supported are `axis`, `level`, `numeric_only`. """ axis = self._get_axis_from_axis_arg(axis) if axis != 0: @@ -5950,10 +5960,6 @@ def mode(self, axis=0, numeric_only=False, dropna=True): cudf.Series.value_counts : Return the counts of values in a Series. - Notes - ----- - ``axis`` parameter is currently not supported. - Examples -------- >>> import cudf @@ -5992,6 +5998,11 @@ def mode(self, axis=0, numeric_only=False, dropna=True): legs wings 0 2 0.0 1 2.0 + + .. pandas-compat:: + **DataFrame.mode** + + ``axis`` parameter is currently not supported. """ if axis not in (0, "index"): raise NotImplementedError("Only axis=0 is currently supported") @@ -6530,7 +6541,7 @@ def to_struct(self, name=None): Notes ----- - Note that a copy of the columns is made. + Note: a copy of the columns is made. """ if not all(isinstance(name, str) for name in self._data.names): warnings.warn( @@ -6635,11 +6646,6 @@ def append( ------- DataFrame - See Also - -------- - cudf.concat : General function to concatenate DataFrame or - objects.
- Notes ----- If a list of dict/series is passed and the keys are all contained in @@ -6649,7 +6655,12 @@ def append( computationally intensive than a single concatenate. A better solution is to append those rows to a list and then concatenate the list with the original DataFrame all at once. - `verify_integrity` parameter is not supported yet. + `verify_integrity` parameter is not supported yet. + + See Also + -------- + cudf.concat : General function to concatenate DataFrame or + objects. Examples -------- @@ -7027,22 +7038,6 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): DataFrame if any assignment statements are included in ``expr``, or None if ``inplace=True``. - Notes - ----- - Difference from pandas: - * Additional kwargs are not supported. - * Bitwise and logical operators are not dtype-dependent. - Specifically, `&` must be used for bitwise operators on integers, - not `and`, which is specifically for the logical and between - booleans. - * Only numerical types are currently supported. - * Operators generally will not cast automatically. Users are - responsible for casting columns to suitable types before - evaluating a function. - * Multiple assignments to the same name (i.e. a sequence of - assignment statements where later statements are conditioned upon - the output of earlier statements) is not supported. - Examples -------- >>> df = cudf.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) @@ -7104,6 +7099,22 @@ def eval(self, expr: str, inplace: bool = False, **kwargs): 2 3 6 9 -3 3 4 4 8 0 4 5 2 7 3 + + .. pandas-compat:: + **DataFrame.eval** + + * Additional kwargs are not supported. + * Bitwise and logical operators are not dtype-dependent. + Specifically, `&` must be used for bitwise operators on integers, + not `and`, which is specifically for the logical and between + booleans. + * Only numerical types are currently supported. + * Operators generally will not cast automatically.
Users are + responsible for casting columns to suitable types before + evaluating a function. + * Multiple assignments to the same name (i.e. a sequence of + assignment statements where later statements are conditioned upon + the output of earlier statements) is not supported. """ if kwargs: raise ValueError( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8b508eac324..35002863065 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -253,11 +253,6 @@ def empty(self): out : bool If DataFrame/Series is empty, return True, if not return False. - Notes - ----- - If DataFrame/Series contains only `null` values, it is still not - considered empty. See the example below. - Examples -------- >>> import cudf @@ -298,6 +293,12 @@ def empty(self): Series([], dtype: float64) >>> s.empty True + + .. pandas-compat:: + **DataFrame.empty, Series.empty, Frame.empty** + + If DataFrame/Series contains only `null` values, it is still not + considered empty. See the example above. """ return self.size == 0 @@ -628,6 +629,8 @@ def where(self, cond, other=None, inplace=False): dtype: int64 .. pandas-compat:: + **DataFrame.where, Series.where, Frame.where** + Note that ``where`` treats missing values as falsy, in parallel with pandas treatment of nullable data: @@ -1908,10 +1911,6 @@ def min( ------- Series - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -1920,6 +1919,11 @@ def min( a 1 b 7 dtype: int64 + + .. pandas-compat:: + **DataFrame.min, Series.min, Frame.min** + + Parameters currently not supported are `level`, `numeric_only`. """ return self._reduce( "min", @@ -1959,10 +1963,6 @@ def max( ------- Series - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -1971,6 +1971,11 @@ def max( a 4 b 10 dtype: int64 + + .. 
pandas-compat:: + **DataFrame.max, Series.max, Frame.max** + + Parameters currently not supported are `level`, `numeric_only`. """ return self._reduce( "max", @@ -2015,10 +2020,6 @@ def sum( ------- Series - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - Examples -------- >>> import cudf @@ -2027,6 +2028,11 @@ def sum( a 10 b 34 dtype: int64 + + .. pandas-compat:: + **DataFrame.sum, Series.sum, Frame.sum** + + Parameters currently not supported are `level`, `numeric_only`. """ return self._reduce( "sum", @@ -2073,10 +2079,6 @@ def product( ------- Series - Notes - ----- - Parameters currently not supported are level`, `numeric_only`. - Examples -------- >>> import cudf @@ -2085,6 +2087,11 @@ def product( a 24 b 5040 dtype: int64 + + .. pandas-compat:: + **DataFrame.product, Series.product, Frame.product** + + Parameters currently not supported are `level`, `numeric_only`. """ axis = self._get_axis_from_axis_arg(axis) return self._reduce( @@ -2179,11 +2186,6 @@ def std( ------- Series - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - Examples -------- >>> import cudf @@ -2192,6 +2194,12 @@ def std( a 1.290994 b 1.290994 dtype: float64 + + .. pandas-compat:: + **DataFrame.std, Series.std, Frame.std** + + Parameters currently not supported are `level` and + `numeric_only` """ return self._reduce( @@ -2235,11 +2243,6 @@ def var( ------- scalar - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - Examples -------- >>> import cudf @@ -2248,6 +2251,12 @@ def var( a 1.666667 b 1.666667 dtype: float64 + + ..
pandas-compat:: + **DataFrame.var, Series.var, Frame.var** + + Parameters currently not supported are `level` and + `numeric_only` """ return self._reduce( "var", @@ -2280,10 +2289,6 @@ def kurtosis( ------- Series or scalar - Notes - ----- - Parameters currently not supported are `level` and `numeric_only` - Examples -------- **Series** @@ -2301,6 +2306,11 @@ def kurtosis( a -1.2 b -1.2 dtype: float64 + + .. pandas-compat:: + **DataFrame.kurtosis, Frame.kurtosis** + + Parameters currently not supported are `level` and `numeric_only` """ if axis not in (0, "index", None): raise NotImplementedError("Only axis=0 is currently supported.") @@ -2333,11 +2343,6 @@ def skew( ------- Series - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - Examples -------- **Series** @@ -2362,6 +2367,12 @@ def skew( a 0.00000 b -0.37037 dtype: float64 + + .. pandas-compat:: + **DataFrame.skew, Series.skew, Frame.skew** + + Parameters currently not supported are `axis`, `level` and + `numeric_only` """ if axis not in (0, "index", None): raise NotImplementedError("Only axis=0 is currently supported.") @@ -2393,10 +2404,6 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): ------- Series - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - Examples -------- >>> import cudf @@ -2405,6 +2412,11 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): a True b False dtype: bool + + .. pandas-compat:: + **DataFrame.all, Series.all, Frame.all** + + Parameters currently not supported are `axis`, `bool_only`, `level`. """ return self._reduce( "all", @@ -2432,10 +2444,6 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): ------- Series - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - Examples -------- >>> import cudf @@ -2444,6 +2452,11 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): a True b True dtype: bool + + .. 
pandas-compat:: + **DataFrame.any, Series.any, Frame.any** + + Parameters currently not supported are `axis`, `bool_only`, `level`. """ return self._reduce( "any", @@ -2498,10 +2511,6 @@ def median( ------- scalar - Notes - ----- - Parameters currently not supported are `level` and `numeric_only`. - Examples -------- >>> import cudf @@ -2516,6 +2525,11 @@ def median( dtype: int64 >>> ser.median() 17.0 + + .. pandas-compat:: + **DataFrame.median, Series.median, Frame.median** + + Parameters currently not supported are `level` and `numeric_only`. """ return self._reduce( "median", diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 91e00eb43f3..2dd2167b32d 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -535,10 +535,10 @@ def _reduce( Series or DataFrame Computed {op} of values within each group. - Notes - ----- - Difference from pandas: - * Not supporting: numeric_only, min_count + .. pandas-compat:: + **{cls}.{op}** + + The ``numeric_only`` and ``min_count`` parameters are not supported. """ if numeric_only: raise NotImplementedError( @@ -897,7 +897,7 @@ def mult(df): 6 2 6 12 .. pandas-compat:: - **groupby.apply** + **GroupBy.apply** cuDF's ``groupby.apply`` is limited compared to pandas. In some situations, Pandas returns the grouped keys as part of @@ -1654,7 +1654,7 @@ def fillna( DataFrame or Series .. pandas-compat:: - **groupby.fillna** + **GroupBy.fillna** This function may return result in different format to the method Pandas supports. For example: @@ -1732,9 +1732,10 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): Series or DataFrame Object shifted within each group. - Notes - ----- - Parameter ``freq`` is unsupported. + .. pandas-compat:: + **GroupBy.shift** + + Parameter ``freq`` is unsupported.
""" if freq is not None: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index daf2502ee96..4d8b1b39258 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -585,11 +585,6 @@ def replace( result : Series Series after replacement. The mask and index are preserved. - Notes - ----- - Parameters that are currently not supported are: `limit`, `regex`, - `method` - Examples -------- **Series** @@ -732,6 +727,12 @@ def replace( 2 2 7 c 3 3 8 d 4 4 9 e + + .. pandas-compat:: + **DataFrame.replace, Series.replace, IndexedFrame.replace** + + Parameters that are currently not supported are: `limit`, `regex`, + `method` """ if limit is not None: raise NotImplementedError("limit parameter is not implemented yet") @@ -1063,13 +1064,6 @@ def truncate(self, before=None, after=None, axis=0, copy=True): `before` and `after` may be specified as strings instead of Timestamps. - .. pandas-compat:: - **DataFrame.truncate, Series.truncate** - - The ``copy`` parameter is only present for API compatibility, but - ``copy=False`` is not supported. This method always generates a - copy. - Examples -------- **Series** @@ -1211,6 +1205,13 @@ def truncate(self, before=None, after=None, axis=0, copy=True): 2021-01-01 23:45:25 1 2 2021-01-01 23:45:26 1 2 2021-01-01 23:45:27 1 2 + + .. pandas-compat:: + **DataFrame.truncate, Series.truncate, IndexedFrame.truncate** + + The ``copy`` parameter is only present for API compatibility, but + ``copy=False`` is not supported. This method always generates a + copy. """ if not copy: raise ValueError("Truncating with copy=False is not supported.") @@ -1463,11 +1464,6 @@ def sort_index( ------- Frame or None - Notes - ----- - Difference from pandas: - * Not supporting: kind, sort_remaining=False - Examples -------- **Series** @@ -1508,6 +1504,11 @@ def sort_index( 1 2 3 3 1 2 2 3 1 + + .. 
pandas-compat:: + **DataFrame.sort_index, Series.sort_index, IndexedFrame.sort_index** + + * Not supporting: kind, sort_remaining=False """ if kind is not None: raise NotImplementedError("kind is not yet supported") @@ -2172,12 +2173,6 @@ def sort_values( ------- Frame : Frame with sorted values. - Notes - ----- - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - Examples -------- >>> import cudf @@ -2189,6 +2184,12 @@ def sort_values( 0 0 -3 2 2 0 1 1 2 + + .. pandas-compat:: + **DataFrame.sort_values, Series.sort_values, IndexedFrame.sort_values** + + * Support axis='index' only. + * Not supporting: inplace, kind """ if na_position not in {"first", "last"}: raise ValueError(f"invalid na_position: {na_position}") @@ -2496,7 +2497,11 @@ def round(self, decimals=0, how="half_even"): cols = { name: col.round(decimals[name], how=how) - if (name in decimals and _is_non_decimal_numeric_dtype(col.dtype)) + if ( + name in decimals + and _is_non_decimal_numeric_dtype(col.dtype) + and not is_bool_dtype(col.dtype) + ) else col.copy(deep=True) for name, col in self._data.items() } @@ -2644,13 +2649,14 @@ def resample( 2018-02-28 18.0 63.333333 - Notes - ----- - Note that the dtype of the index (or the 'on' column if using - 'on=') in the result will be of a frequency closest to the - resampled frequency. For example, if resampling from - nanoseconds to milliseconds, the index will be of dtype - 'datetime64[ms]'. + .. pandas-compat:: + **DataFrame.resample, Series.resample, IndexedFrame.resample** + + Note that the dtype of the index (or the 'on' column if using + 'on=') in the result will be of a frequency closest to the + resampled frequency. For example, if resampling from + nanoseconds to milliseconds, the index will be of dtype + 'datetime64[ms]'. """ import cudf.core.resample @@ -3083,18 +3089,6 @@ def sample( provided via the `random_state` parameter. 
This function will always produce the same sample given an identical `random_state`. - Notes - ----- - When sampling from ``axis=0/'index'``, ``random_state`` can be either - a numpy random state (``numpy.random.RandomState``) or a cupy random - state (``cupy.random.RandomState``). When a numpy random state is - used, the output is guaranteed to match the output of the corresponding - pandas method call, but generating the sample may be slow. If exact - pandas equivalence is not required, using a cupy random state will - achieve better performance, especially when sampling large number of - items. It's advised to use the matching `ndarray` type to the random - state for the `weights` array. - Parameters ---------- n : int, optional @@ -3162,6 +3156,19 @@ def sample( a c 0 1 3 1 2 4 + + .. pandas-compat:: + **DataFrame.sample, Series.sample, IndexedFrame.sample** + + When sampling from ``axis=0/'index'``, ``random_state`` can be either + a numpy random state (``numpy.random.RandomState``) or a cupy random + state (``cupy.random.RandomState``). When a numpy random state is + used, the output is guaranteed to match the output of the corresponding + pandas method call, but generating the sample may be slow. If exact + pandas equivalence is not required, using a cupy random state will + achieve better performance, especially when sampling large number of + items. It's advised to use the matching `ndarray` type to the random + state for the `weights` array. """ axis = self._get_axis_from_axis_arg(axis) size = self.shape[axis] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 1c697a2d824..4b6fde74d06 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1203,10 +1203,11 @@ def map(self, arg, na_action=None) -> "Series": 4 dtype: int64 - Notes - ----- - Please note map currently only supports fixed-width numeric - type functions. + .. 
pandas-compat:: + **Series.map** + + Please note map currently only supports fixed-width numeric + type functions. """ if isinstance(arg, dict): if hasattr(arg, "__missing__"): @@ -1979,12 +1980,6 @@ def sort_values( ------- Series : Series with sorted values. - Notes - ----- - Difference from pandas: - * Support axis='index' only. - * Not supporting: inplace, kind - Examples -------- >>> import cudf @@ -1996,6 +1991,12 @@ def sort_values( 3 4 1 5 dtype: int64 + + .. pandas-compat:: + **Series.sort_values** + + * Support axis='index' only. + * The ``inplace`` and ``kind`` arguments are currently unsupported """ return super().sort_values( by=self.name, @@ -2443,16 +2444,17 @@ def count(self, level=None, **kwargs): int Number of non-null values in the Series. - Notes - ----- - Parameters currently not supported is `level`. - Examples -------- >>> import cudf >>> ser = cudf.Series([1, 5, 2, 4, 3]) >>> ser.count() 5 + + .. pandas-compat:: + **Series.count** + + The `level` parameter is currently not supported. """ if level is not None: @@ -2554,10 +2556,6 @@ def cov(self, other, min_periods=None): Covariance between Series and other normalized by N-1 (unbiased estimator). - Notes - ----- - `min_periods` parameter is not yet supported. - Examples -------- >>> import cudf @@ -2565,6 +2563,11 @@ def cov(self, other, min_periods=None): >>> ser2 = cudf.Series([0.12, 0.26, 0.51]) >>> ser1.cov(ser2) -0.015750000000000004 + + .. pandas-compat:: + **Series.cov** + + `min_periods` parameter is not yet supported. """ if min_periods is not None: @@ -3285,12 +3288,6 @@ def rename(self, index=None, copy=True): ------- Series - Notes - ----- - Difference from pandas: - - Supports scalar values only for changing name attribute - - Not supporting : inplace, level - Examples -------- >>> import cudf @@ -3309,6 +3306,12 @@ def rename(self, index=None, copy=True): Name: numeric_series, dtype: int64 >>> renamed_series.name 'numeric_series' + + ..
pandas-compat:: + **Series.rename** + + - Supports scalar values only for changing name attribute + - The ``inplace`` and ``level`` arguments are not supported """ out_data = self._data.copy(deep=copy) return Series._from_data(out_data, self.index, name=index) @@ -4488,11 +4491,6 @@ def strftime(self, date_format, *args, **kwargs): Series Series of formatted strings. - Notes - ----- - The following date format identifiers are not yet - supported: ``%c``, ``%x``,``%X`` - Examples -------- >>> import cudf @@ -4519,6 +4517,12 @@ def strftime(self, date_format, *args, **kwargs): 1 2000 / 30 / 06 2 2000 / 30 / 09 dtype: object + + .. pandas-compat:: + **series.DatetimeProperties.strftime** + + The following date format identifiers are not yet + supported: ``%c``, ``%x``, ``%X`` """ if not isinstance(date_format, str): diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 92ef49e92d9..472aee9c516 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. import math import re @@ -808,7 +808,6 @@ def date_range( '2023-12-23 08:00:00', '2025-02-23 08:00:00', '2026-04-23 08:00:00'], dtype='datetime64[ns]') - """ if tz is not None: raise NotImplementedError("tz is currently unsupported.") diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 0273227010b..609a5503040 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION.
import warnings @@ -56,13 +56,6 @@ def to_numeric(arg, errors="raise", downcast=None): Depending on the input, if series is passed in, series is returned, otherwise ndarray - Notes - ----- - An important difference from pandas is that this function does not accept - mixed numeric/non-numeric type sequences. For example ``[1, 'a']``. - A ``TypeError`` will be raised when such input is received, regardless of - ``errors`` parameter. - Examples -------- >>> s = cudf.Series(['1', '2.0', '3e3']) @@ -92,6 +85,15 @@ def to_numeric(arg, errors="raise", downcast=None): 1 1.0 2 3000.0 dtype: float64 + + .. pandas-compat:: + **cudf.to_numeric** + + An important difference from pandas is that this function does not accept + mixed numeric/non-numeric type sequences. For example ``[1, 'a']``. + A ``TypeError`` will be raised when such input is received, regardless of + ``errors`` parameter. + """ if errors not in {"raise", "ignore", "coerce"}: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 65e24c7c704..09b9f57356c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3813,17 +3813,22 @@ def test_ndim(): -3, 0, 5, - pd.Series([1, 4, 3, -6], index=["w", "x", "y", "z"]), - cudf.Series([-4, -2, 12], index=["x", "y", "z"]), - {"w": -1, "x": 15, "y": 2}, + pd.Series( + [1, 4, 3, -6], + index=["floats", "ints", "floats_with_nan", "floats_same"], + ), + cudf.Series( + [-4, -2, 12], index=["ints", "floats_with_nan", "floats_same"] + ), + {"floats": -1, "ints": 15, "floats_will_nan": 2}, ], ) def test_dataframe_round(decimals): - pdf = pd.DataFrame( + gdf = cudf.DataFrame( { - "w": np.arange(0.5, 10.5, 1), - "x": np.random.normal(-100, 100, 10), - "y": np.array( + "floats": np.arange(0.5, 10.5, 1), + "ints": np.random.normal(-100, 100, 10), + "floats_with_na": np.array( [ 14.123, 2.343, @@ -3832,31 +3837,25 @@ def test_dataframe_round(decimals): -8.302, np.nan, 94.313, - -112.236, + 
None, -8.029, np.nan, ] ), - "z": np.repeat([-0.6459412758761901], 10), + "floats_same": np.repeat([-0.6459412758761901], 10), + "bools": np.random.choice([True, None, False], 10), + "strings": np.random.choice(["abc", "xyz", None], 10), + "struct": np.random.choice([{"abc": 1}, {"xyz": 2}, None], 10), + "list": [[1], [2], None, [4], [3]] * 2, } ) - gdf = cudf.DataFrame.from_pandas(pdf) + pdf = gdf.to_pandas() if isinstance(decimals, cudf.Series): pdecimals = decimals.to_pandas() else: pdecimals = decimals - result = gdf.round(decimals) - expected = pdf.round(pdecimals) - assert_eq(result, expected) - - # with nulls, maintaining existing null mask - for c in pdf.columns: - arr = pdf[c].to_numpy().astype("float64") # for pandas nulls - arr.ravel()[np.random.choice(10, 5, replace=False)] = np.nan - pdf[c] = gdf[c] = arr - result = gdf.round(decimals) expected = pdf.round(pdecimals) @@ -10012,3 +10011,17 @@ def test_dataframe_transpose_complex_types(data): actual = gdf.T assert_eq(expected, actual) + + +def test_dataframe_from_arrow_slice(): + table = pa.Table.from_pandas( + pd.DataFrame.from_dict( + {"a": ["aa", "bb", "cc"] * 3, "b": [1, 2, 3] * 3} + ) + ) + table_slice = table.slice(3, 7) + + expected = table_slice.to_pandas() + actual = cudf.DataFrame.from_arrow(table_slice) + + assert_eq(expected, actual) diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/accessors.py index 1c21fca51c8..873981b24f9 100644 --- a/python/dask_cudf/dask_cudf/accessors.py +++ b/python/dask_cudf/dask_cudf/accessors.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. 
class StructMethods: @@ -263,11 +263,6 @@ def sort_values( ------- ListColumn with each list sorted - Notes - ----- - Difference from pandas: - * Not supporting: `inplace`, `kind` - Examples -------- >>> s = cudf.Series([[4, 2, None, 9], [8, 8, 2], [2, 1]]) @@ -277,6 +272,11 @@ def sort_values( 1 [2.0, 8.0, 8.0] 2 [1.0, 2.0] dtype: list + + .. pandas-compat:: + **ListMethods.sort_values** + + The `inplace` and `kind` arguments are currently unsupported. """ return self.d_series.map_partitions( lambda s: s.list.sort_values( diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index bd398cb9607..1e3ff63ce76 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. import warnings from contextlib import ExitStack from functools import partial @@ -230,7 +230,8 @@ def read_partition( **read_kwargs, ) ) - paths = rgs = [] + paths = [] + rgs = [] last_partition_keys = None paths.append(path) rgs.append(