Skip to content

Commit

Permalink
Update missing docstring examples in python public APIs (#7546)
Browse files Browse the repository at this point in the history
Fully resolves: #5290 

This PR:

- [x] Adds missing docstring examples in python public APIs.
- [x] Adds some missing alias APIs.
- [x] Fixes issue in `Series.take` where the index was not correctly being removed when `keep_index=False`.
- [x] **Removes** `Series.values_to_string`. This API seems to have last been touched 4 years ago, and since we have removed support for iterating over GPU objects, it is best to remove this API altogether.

Authors:
  - GALI PREM SAGAR (@galipremsagar)

Approvers:
  - Keith Kraus (@kkraus14)

URL: #7546
  • Loading branch information
galipremsagar authored Mar 10, 2021
1 parent 850548d commit 2e4b5a6
Show file tree
Hide file tree
Showing 4 changed files with 1,845 additions and 57 deletions.
301 changes: 298 additions & 3 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,32 @@ def deserialize(cls, header, frames):

@property
def dtypes(self):
"""Return the dtypes in this object."""
"""
Return the dtypes in this object.
Returns
-------
pandas.Series
The data type of each column.
Examples
--------
>>> import cudf
>>> import pandas as pd
>>> df = cudf.DataFrame({'float': [1.0],
... 'int': [1],
... 'datetime': [pd.Timestamp('20180310')],
... 'string': ['foo']})
>>> df
float int datetime string
0 1.0 1 2018-03-10 foo
>>> df.dtypes
float float64
int int64
datetime datetime64[us]
string object
dtype: object
"""
return cudf.utils.utils._create_pandas_series(
data=[x.dtype for x in self._data.columns], index=self._data.names,
)
Expand Down Expand Up @@ -1133,6 +1158,39 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs):
Returns
-------
casted : DataFrame
Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({'a': [10, 20, 30], 'b': [1, 2, 3]})
>>> df
a b
0 10 1
1 20 2
2 30 3
>>> df.dtypes
a int64
b int64
dtype: object
Cast all columns to `int32`:
>>> df.astype('int32').dtypes
a int32
b int32
dtype: object
Cast `a` to `float32` using a dictionary:
>>> df.astype({'a': 'float32'}).dtypes
a float32
b int64
dtype: object
>>> df.astype({'a': 'float32'})
a b
0 10.0 1
1 20.0 2
2 30.0 3
"""
result = DataFrame(index=self.index)

Expand Down Expand Up @@ -3360,7 +3418,71 @@ def drop_duplicates(
"""
Return DataFrame with duplicate rows removed, optionally only
considering certain subset of columns.
"""
Parameters
----------
subset : column label or sequence of labels, optional
Only consider certain columns for identifying duplicates, by
default use all of the columns.
keep : {'first', 'last', False}, default 'first'
Determines which duplicates (if any) to keep.
- ``first`` : Drop duplicates except for the first occurrence.
- ``last`` : Drop duplicates except for the last occurrence.
- False : Drop all duplicates.
inplace : bool, default False
Whether to drop duplicates in place or to return a copy.
ignore_index : bool, default False
If True, the resulting axis will be labeled 0, 1, …, n - 1.
Returns
-------
DataFrame or None
DataFrame with duplicates removed or None if ``inplace=True``.
Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({
... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
... 'rating': [4, 4, 3.5, 15, 5]
... })
>>> df
brand style rating
0 Yum Yum cup 4.0
1 Yum Yum cup 4.0
2 Indomie cup 3.5
3 Indomie pack 15.0
4 Indomie pack 5.0
By default, duplicate rows are removed based
on all columns. Note that the order of
the returned rows is not guaranteed
to be sorted.
>>> df.drop_duplicates()
brand style rating
2 Indomie cup 3.5
4 Indomie pack 5.0
3 Indomie pack 15.0
0 Yum Yum cup 4.0
To remove duplicates on specific column(s),
use `subset`.
>>> df.drop_duplicates(subset=['brand'])
brand style rating
2 Indomie cup 3.5
0 Yum Yum cup 4.0
To remove duplicates and keep last occurrences, use `keep`.
>>> df.drop_duplicates(subset=['brand', 'style'], keep='last')
brand style rating
2 Indomie cup 3.5
4 Indomie pack 5.0
1 Yum Yum cup 4.0
""" # noqa: E501
outdf = super().drop_duplicates(
subset=subset, keep=keep, ignore_index=ignore_index
)
Expand Down Expand Up @@ -3439,6 +3561,32 @@ def rename(
Rename will not overwrite column names. If a list with duplicates is
passed, column names will be postfixed with a number.
Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
>>> df
A B
0 1 4
1 2 5
2 3 6
Rename columns using a mapping:
>>> df.rename(columns={"A": "a", "B": "c"})
a c
0 1 4
1 2 5
2 3 6
Rename index using a mapping:
>>> df.rename(index={0: 10, 1: 20, 2: 30})
A B
10 1 4
20 2 5
30 3 6
"""
if errors != "ignore":
raise NotImplementedError(
Expand Down Expand Up @@ -3663,6 +3811,21 @@ def label_encoding(
Returns
-------
A new dataframe with a new column appended for the coded values.
Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 10, 20]})
>>> df
a b
0 1 10
1 2 10
2 3 20
>>> df.label_encoding(column="b", prefix="b_col", cats=[10, 20])
a b b_col_labels
0 1 10 0
1 2 10 0
2 3 20 1
"""

newname = prefix_sep.join([prefix, "labels"])
Expand Down Expand Up @@ -3992,20 +4155,131 @@ def agg(self, aggs, axis=None):
def nlargest(self, n, columns, keep="first"):
"""Get the rows of the DataFrame sorted by the n largest values of *columns*
Parameters
----------
n : int
Number of rows to return.
columns : label or list of labels
Column label(s) to order by.
keep : {'first', 'last'}, default 'first'
Where there are duplicate values:
- `first` : prioritize the first occurrence(s)
- `last` : prioritize the last occurrence(s)
Returns
-------
DataFrame
The first `n` rows ordered by the given columns in descending
order.
Notes
-----
Difference from pandas:
- Only a single column is supported in *columns*
Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({'population': [59000000, 65000000, 434000,
... 434000, 434000, 337000, 11300,
... 11300, 11300],
... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
... 17036, 182, 38, 311],
... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
... "IS", "NR", "TV", "AI"]},
... index=["Italy", "France", "Malta",
... "Maldives", "Brunei", "Iceland",
... "Nauru", "Tuvalu", "Anguilla"])
>>> df
population GDP alpha-2
Italy 59000000 1937894 IT
France 65000000 2583560 FR
Malta 434000 12011 MT
Maldives 434000 4520 MV
Brunei 434000 12128 BN
Iceland 337000 17036 IS
Nauru 11300 182 NR
Tuvalu 11300 38 TV
Anguilla 11300 311 AI
>>> df.nlargest(3, 'population')
population GDP alpha-2
France 65000000 2583560 FR
Italy 59000000 1937894 IT
Malta 434000 12011 MT
>>> df.nlargest(3, 'population', keep='last')
population GDP alpha-2
France 65000000 2583560 FR
Italy 59000000 1937894 IT
Brunei 434000 12128 BN
"""
return self._n_largest_or_smallest("nlargest", n, columns, keep)

def nsmallest(self, n, columns, keep="first"):
"""Get the rows of the DataFrame sorted by the n smallest values of *columns*
Parameters
----------
n : int
Number of items to retrieve.
columns : list or str
Column name or names to order by.
keep : {'first', 'last'}, default 'first'
Where there are duplicate values:
- ``first`` : take the first occurrence.
- ``last`` : take the last occurrence.
Returns
-------
DataFrame
Notes
-----
Difference from pandas:
- Only a single column is supported in *columns*
Examples
--------
>>> import cudf
>>> df = cudf.DataFrame({'population': [59000000, 65000000, 434000,
... 434000, 434000, 337000, 337000,
... 11300, 11300],
... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
... 17036, 182, 38, 311],
... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
... "IS", "NR", "TV", "AI"]},
... index=["Italy", "France", "Malta",
... "Maldives", "Brunei", "Iceland",
... "Nauru", "Tuvalu", "Anguilla"])
>>> df
population GDP alpha-2
Italy 59000000 1937894 IT
France 65000000 2583560 FR
Malta 434000 12011 MT
Maldives 434000 4520 MV
Brunei 434000 12128 BN
Iceland 337000 17036 IS
Nauru 337000 182 NR
Tuvalu 11300 38 TV
Anguilla 11300 311 AI
In the following example, we will use ``nsmallest`` to select the
three rows having the smallest values in column "population".
>>> df.nsmallest(3, 'population')
population GDP alpha-2
Tuvalu 11300 38 TV
Anguilla 11300 311 AI
Iceland 337000 17036 IS
When using ``keep='last'``, ties are resolved in reverse order:
>>> df.nsmallest(3, 'population', keep='last')
population GDP alpha-2
Anguilla 11300 311 AI
Tuvalu 11300 38 TV
Nauru 337000 182 NR
"""
return self._n_largest_or_smallest("nsmallest", n, columns, keep)

Expand Down Expand Up @@ -5608,7 +5882,28 @@ def quantile(
non-numeric types and result is expected to be a Series in case of
Pandas. cuDF will return a DataFrame as it doesn't support mixed
types under Series.
"""
Examples
--------
>>> import cupy as cp
>>> import cudf
>>> df = cudf.DataFrame(cp.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
... columns=['a', 'b'])
>>> df
a b
0 1 1
1 2 10
2 3 100
3 4 100
>>> df.quantile(0.1)
a 1.3
b 3.7
Name: 0.1, dtype: float64
>>> df.quantile([.1, .5])
a b
0.1 1.3 3.7
0.5 2.5 55.0
""" # noqa: E501
if axis not in (0, None):
raise NotImplementedError("axis is not implemented yet")

Expand Down
Loading

0 comments on commit 2e4b5a6

Please sign in to comment.