Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Update missing docstring examples in python public APIs #7546

Merged
merged 8 commits into from
Mar 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
301 changes: 298 additions & 3 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,32 @@ def deserialize(cls, header, frames):

@property
def dtypes(self):
    """
    Return the data type of each column in this object.

    Returns
    -------
    pandas.Series
        The data type of each column, labelled by column name.

    Examples
    --------
    >>> import cudf
    >>> import pandas as pd
    >>> df = cudf.DataFrame({'float': [1.0],
    ...                      'int': [1],
    ...                      'datetime': [pd.Timestamp('20180310')],
    ...                      'string': ['foo']})
    >>> df
       float  int   datetime string
    0    1.0    1 2018-03-10    foo
    >>> df.dtypes
    float              float64
    int                  int64
    datetime    datetime64[us]
    string              object
    dtype: object
    """
    # One dtype per column, in column order; the column names become the
    # index of the resulting pandas Series.
    column_dtypes = [column.dtype for column in self._data.columns]
    column_names = self._data.names
    return cudf.utils.utils._create_pandas_series(
        data=column_dtypes, index=column_names,
    )
Expand Down Expand Up @@ -1133,6 +1158,39 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs):
Returns
-------
casted : DataFrame

Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({'a': [10, 20, 30], 'b': [1, 2, 3]})
>>> df
a b
0 10 1
1 20 2
2 30 3
>>> df.dtypes
a int64
b int64
dtype: object

Cast all columns to `int32`:

>>> df.astype('int32').dtypes
a int32
b int32
dtype: object

Cast `a` to `float32` using a dictionary:

>>> df.astype({'a': 'float32'}).dtypes
a float32
b int64
dtype: object
>>> df.astype({'a': 'float32'})
a b
0 10.0 1
1 20.0 2
2 30.0 3
"""
result = DataFrame(index=self.index)

Expand Down Expand Up @@ -3360,7 +3418,71 @@ def drop_duplicates(
"""
Return DataFrame with duplicate rows removed, optionally only
considering certain subset of columns.
"""

Parameters
----------
subset : column label or sequence of labels, optional
Only consider certain columns for identifying duplicates, by
default use all of the columns.
keep : {'first', 'last', False}, default 'first'
Determines which duplicates (if any) to keep.
- ``first`` : Drop duplicates except for the first occurrence.
- ``last`` : Drop duplicates except for the last occurrence.
- False : Drop all duplicates.
inplace : bool, default False
Whether to drop duplicates in place or to return a copy.
ignore_index : bool, default False
If True, the resulting axis will be labeled 0, 1, …, n - 1.

Returns
-------
DataFrame or None
DataFrame with duplicates removed or None if ``inplace=True``.

Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({
... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
... 'rating': [4, 4, 3.5, 15, 5]
... })
>>> df
brand style rating
0 Yum Yum cup 4.0
1 Yum Yum cup 4.0
2 Indomie cup 3.5
3 Indomie pack 15.0
4 Indomie pack 5.0

By default, duplicate rows are removed based
on all columns. Note that the order of
the returned rows is not guaranteed
to be sorted.

>>> df.drop_duplicates()
brand style rating
2 Indomie cup 3.5
4 Indomie pack 5.0
3 Indomie pack 15.0
0 Yum Yum cup 4.0

To remove duplicates on specific column(s),
use `subset`.

>>> df.drop_duplicates(subset=['brand'])
brand style rating
2 Indomie cup 3.5
0 Yum Yum cup 4.0

To remove duplicates and keep last occurrences, use `keep`.

>>> df.drop_duplicates(subset=['brand', 'style'], keep='last')
brand style rating
2 Indomie cup 3.5
4 Indomie pack 5.0
1 Yum Yum cup 4.0
""" # noqa: E501
outdf = super().drop_duplicates(
subset=subset, keep=keep, ignore_index=ignore_index
)
Expand Down Expand Up @@ -3439,6 +3561,32 @@ def rename(

Rename will not overwrite column names. If a list with duplicates is
passed, column names will be postfixed with a number.

Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
>>> df
A B
0 1 4
1 2 5
2 3 6

Rename columns using a mapping:

>>> df.rename(columns={"A": "a", "B": "c"})
a c
0 1 4
1 2 5
2 3 6

Rename index using a mapping:

>>> df.rename(index={0: 10, 1: 20, 2: 30})
A B
10 1 4
20 2 5
30 3 6
"""
if errors != "ignore":
raise NotImplementedError(
Expand Down Expand Up @@ -3663,6 +3811,21 @@ def label_encoding(
Returns
-------
A new DataFrame with a new column appended for the coded values.

Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 10, 20]})
>>> df
a b
0 1 10
1 2 10
2 3 20
>>> df.label_encoding(column="b", prefix="b_col", cats=[10, 20])
a b b_col_labels
0 1 10 0
1 2 10 0
2 3 20 1
"""

newname = prefix_sep.join([prefix, "labels"])
Expand Down Expand Up @@ -3992,20 +4155,131 @@ def agg(self, aggs, axis=None):
def nlargest(self, n, columns, keep="first"):
    """Return the first n rows ordered by *columns* in descending order.

    Parameters
    ----------
    n : int
        Number of rows to return.
    columns : label or list of labels
        Column label(s) to order by.
    keep : {'first', 'last'}, default 'first'
        Where there are duplicate values:

        - `first` : prioritize the first occurrence(s)
        - `last` : prioritize the last occurrence(s)

    Returns
    -------
    DataFrame
        The first `n` rows ordered by the given columns in descending
        order.

    Notes
    -----
    Difference from pandas:
        - Only a single column is supported in *columns*

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame({'population': [59000000, 65000000, 434000,
    ...                                    434000, 434000, 337000, 11300,
    ...                                    11300, 11300],
    ...                      'GDP': [1937894, 2583560, 12011, 4520, 12128,
    ...                              17036, 182, 38, 311],
    ...                      'alpha-2': ["IT", "FR", "MT", "MV", "BN",
    ...                                  "IS", "NR", "TV", "AI"]},
    ...                     index=["Italy", "France", "Malta",
    ...                            "Maldives", "Brunei", "Iceland",
    ...                            "Nauru", "Tuvalu", "Anguilla"])
    >>> df
              population      GDP alpha-2
    Italy       59000000  1937894      IT
    France      65000000  2583560      FR
    Malta         434000    12011      MT
    Maldives      434000     4520      MV
    Brunei        434000    12128      BN
    Iceland       337000    17036      IS
    Nauru          11300      182      NR
    Tuvalu         11300       38      TV
    Anguilla       11300      311      AI
    >>> df.nlargest(3, 'population')
            population      GDP alpha-2
    France    65000000  2583560      FR
    Italy     59000000  1937894      IT
    Malta       434000    12011      MT
    >>> df.nlargest(3, 'population', keep='last')
            population      GDP alpha-2
    France    65000000  2583560      FR
    Italy     59000000  1937894      IT
    Brunei      434000    12128      BN
    """
    # The shared helper is told which selection to perform by name.
    selection = "nlargest"
    return self._n_largest_or_smallest(selection, n, columns, keep)

def nsmallest(self, n, columns, keep="first"):
    """Return the first n rows ordered by *columns* in ascending order.

    Parameters
    ----------
    n : int
        Number of items to retrieve.
    columns : list or str
        Column name or names to order by.
    keep : {'first', 'last'}, default 'first'
        Where there are duplicate values:

        - ``first`` : take the first occurrence.
        - ``last`` : take the last occurrence.

    Returns
    -------
    DataFrame
        The first `n` rows ordered by the given columns in ascending
        order.

    Notes
    -----
    Difference from pandas:
        - Only a single column is supported in *columns*

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame({'population': [59000000, 65000000, 434000,
    ...                                    434000, 434000, 337000, 337000,
    ...                                    11300, 11300],
    ...                      'GDP': [1937894, 2583560, 12011, 4520, 12128,
    ...                              17036, 182, 38, 311],
    ...                      'alpha-2': ["IT", "FR", "MT", "MV", "BN",
    ...                                  "IS", "NR", "TV", "AI"]},
    ...                     index=["Italy", "France", "Malta",
    ...                            "Maldives", "Brunei", "Iceland",
    ...                            "Nauru", "Tuvalu", "Anguilla"])
    >>> df
              population      GDP alpha-2
    Italy       59000000  1937894      IT
    France      65000000  2583560      FR
    Malta         434000    12011      MT
    Maldives      434000     4520      MV
    Brunei        434000    12128      BN
    Iceland       337000    17036      IS
    Nauru         337000      182      NR
    Tuvalu         11300       38      TV
    Anguilla       11300      311      AI

    In the following example, we will use ``nsmallest`` to select the
    three rows having the smallest values in column "population".

    >>> df.nsmallest(3, 'population')
              population    GDP alpha-2
    Tuvalu         11300     38      TV
    Anguilla       11300    311      AI
    Iceland       337000  17036      IS

    When using ``keep='last'``, ties are resolved in reverse order:

    >>> df.nsmallest(3, 'population', keep='last')
              population  GDP alpha-2
    Anguilla       11300  311      AI
    Tuvalu         11300   38      TV
    Nauru         337000  182      NR
    """
    # The shared helper is told which selection to perform by name.
    selection = "nsmallest"
    return self._n_largest_or_smallest(selection, n, columns, keep)

Expand Down Expand Up @@ -5608,7 +5882,28 @@ def quantile(
non-numeric types and result is expected to be a Series in case of
Pandas. cuDF will return a DataFrame as it doesn't support mixed
types under Series.
"""

Examples
--------
>>> import cupy as cp
>>> import cudf
>>> df = cudf.DataFrame(cp.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
... columns=['a', 'b'])
>>> df
a b
0 1 1
1 2 10
2 3 100
3 4 100
>>> df.quantile(0.1)
a 1.3
b 3.7
Name: 0.1, dtype: float64
>>> df.quantile([.1, .5])
a b
0.1 1.3 3.7
0.5 2.5 55.0
""" # noqa: E501
if axis not in (0, None):
raise NotImplementedError("axis is not implemented yet")

Expand Down
Loading