From fb1ea57c3e59152658eb15f08fc2477a2b6e2deb Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 9 Mar 2021 08:17:44 -0800 Subject: [PATCH 1/6] add docs to some apis --- python/cudf/cudf/core/dataframe.py | 126 ++++++++++- python/cudf/cudf/core/index.py | 29 ++- python/cudf/cudf/core/series.py | 326 ++++++++++++++++++++++++++++- 3 files changed, 471 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 18a7f052d62..8849c1b55a2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -583,7 +583,32 @@ def deserialize(cls, header, frames): @property def dtypes(self): - """Return the dtypes in this object.""" + """ + Return the dtypes in this object. + + Returns + ------- + pandas.Series + The data type of each column. + + Examples + -------- + >>> import cudf + >>> import pandas as pd + >>> df = cudf.DataFrame({'float': [1.0], + ... 'int': [1], + ... 'datetime': [pd.Timestamp('20180310')], + ... 
'string': ['foo']}) + >>> df + float int datetime string + 0 1.0 1 2018-03-10 foo + >>> df.dtypes + float float64 + int int64 + datetime datetime64[us] + string object + dtype: object + """ return cudf.utils.utils._create_pandas_series( data=[x.dtype for x in self._data.columns], index=self._data.names, ) @@ -1133,6 +1158,39 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): Returns ------- casted : DataFrame + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [10, 20, 30], 'b': [1, 2, 3]}) + >>> df + a b + 0 10 1 + 1 20 2 + 2 30 3 + >>> df.dtypes + a int64 + b int64 + dtype: object + + Cast all columns to `int32`: + + >>> df.astype('int32').dtypes + a int32 + b int32 + dtype: object + + Cast `a` to `float32` using a dictionary: + + >>> df.astype({'a': 'float32'}).dtypes + a float32 + b int64 + dtype: object + >>> df.astype({'a': 'float32'}) + a b + 0 10.0 1 + 1 20.0 2 + 2 30.0 3 """ result = DataFrame(index=self.index) @@ -3360,7 +3418,71 @@ def drop_duplicates( """ Return DataFrame with duplicate rows removed, optionally only considering certain subset of columns. - """ + + Parameters + ---------- + subset : column label or sequence of labels, optional + Only consider certain columns for identifying duplicates, by + default use all of the columns. + keep : {'first', 'last', False}, default 'first' + Determines which duplicates (if any) to keep. + - ``first`` : Drop duplicates except for the first occurrence. + - ``last`` : Drop duplicates except for the last occurrence. + - False : Drop all duplicates. + inplace : bool, default False + Whether to drop duplicates in place or to return a copy. + ignore_index : bool, default False + If True, the resulting axis will be labeled 0, 1, …, n - 1. + + Returns + ------- + DataFrame or None + DataFrame with duplicates removed or None if ``inplace=True``. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({ + ... 
'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], + ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], + ... 'rating': [4, 4, 3.5, 15, 5] + ... }) + >>> df + brand style rating + 0 Yum Yum cup 4.0 + 1 Yum Yum cup 4.0 + 2 Indomie cup 3.5 + 3 Indomie pack 15.0 + 4 Indomie pack 5.0 + + By default, it removes duplicate rows based + on all columns. Note that order of + the rows being returned is not guaranteed + to be sorted. + + >>> df.drop_duplicates() + brand style rating + 2 Indomie cup 3.5 + 4 Indomie pack 5.0 + 3 Indomie pack 15.0 + 0 Yum Yum cup 4.0 + + To remove duplicates on specific column(s), + use `subset`. + + >>> df.drop_duplicates(subset=['brand']) + brand style rating + 2 Indomie cup 3.5 + 0 Yum Yum cup 4.0 + + To remove duplicates and keep last occurrences, use `keep`. + + >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') + brand style rating + 2 Indomie cup 3.5 + 4 Indomie pack 5.0 + 1 Yum Yum cup 4.0 + """ # noqa: E501 outdf = super().drop_duplicates( subset=subset, keep=keep, ignore_index=ignore_index ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 8c86352b2a7..f3b07a40acb 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -156,7 +156,16 @@ def drop_duplicates(self, keep="first"): Returns ------- deduplicated : Index - """ + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) + >>> idx + StringIndex(['lama' 'cow' 'lama' 'beetle' 'lama' 'hippo'], dtype='object') + >>> idx.drop_duplicates() + StringIndex(['beetle' 'cow' 'hippo' 'lama'], dtype='object') + """ # noqa: E501 return super().drop_duplicates(keep=keep) @property @@ -1198,6 +1207,15 @@ def astype(self, dtype, copy=False): ------- Index Index with values cast to specified dtype. 
+ + Examples + -------- + >>> import cudf + >>> index = cudf.Index([1, 2, 3]) + >>> index + Int64Index([1, 2, 3], dtype='int64') + >>> index.astype('float64') + Float64Index([1.0, 2.0, 3.0], dtype='float64') """ if pd.api.types.is_dtype_equal(dtype, self.dtype): return self.copy(deep=copy) @@ -1290,6 +1308,15 @@ def empty(self): ------- out : bool If Index is empty, return True, if not return False. + + Examples + -------- + >>> import cudf + >>> index = cudf.Index([]) + >>> index + Float64Index([], dtype='float64') + >>> index.empty + True """ return not self.size diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f80c6a9b452..0c9da777bd8 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1189,7 +1189,8 @@ def _binaryop(self, other, fn, fill_value=None, reflect=False): return result def add(self, other, fill_value=None, axis=0): - """Addition of series and other, element-wise + """ + Addition of series and other, element-wise (binary operator add). Parameters @@ -1198,6 +1199,43 @@ def add(self, other, fill_value=None, axis=0): fill_value : None or value Value to fill nulls with before computation. If data in both corresponding Series locations is null the result will be null + + Returns + ------- + Series + The result of the addition. 
+ + Examples + -------- + >>> import cudf + >>> a = cudf.Series([1, 1, 1, None], index=['a', 'b', 'c', 'd']) + >>> a + a 1 + b 1 + c 1 + d + dtype: int64 + >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e']) + >>> b + a 1 + b + d 1 + e + dtype: int64 + >>> a.add(b) + a 2 + b + c + d + e + dtype: int64 + >>> a.add(b, fill_value=0) + a 2 + b 1 + c 1 + d 1 + e + dtype: int64 """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") @@ -1867,7 +1905,71 @@ def dropna(self, axis=0, inplace=False, how=None): def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): """ - Return Series with duplicate values removed + Return Series with duplicate values removed. + + Parameters + ---------- + keep : {'first', 'last', ``False``}, default 'first' + Method to handle dropping duplicates: + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. + + inplace : bool, default ``False`` + If ``True``, performs operation inplace and returns None. + + Returns + ------- + Series or None + Series with duplicates dropped or None if ``inplace=True``. + + Examples + -------- + >>> s = cudf.Series(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'], + ... name='animal') + >>> s + 0 lama + 1 cow + 2 lama + 3 beetle + 4 lama + 5 hippo + Name: animal, dtype: object + + With the `keep` parameter, the selection behaviour of duplicated + values can be changed. The value ‘first’ keeps the first + occurrence for each set of duplicated entries. + The default value of keep is ‘first’. Note that order of + the rows being returned is not guaranteed + to be sorted. + + >>> s.drop_duplicates() + 3 beetle + 1 cow + 5 hippo + 0 lama + Name: animal, dtype: object + + The value ‘last’ for parameter `keep` keeps the last occurrence + for each set of duplicated entries. 
+ + >>> s.drop_duplicates(keep='last') + 3 beetle + 1 cow + 5 hippo + 4 lama + Name: animal, dtype: object + + The value `False` for parameter `keep` discards all sets + of duplicated entries. Setting the value of ‘inplace’ to + `True` performs the operation inplace and returns `None`. + + >>> s.drop_duplicates(keep=False, inplace=True) + >>> s + 3 beetle + 1 cow + 5 hippo + Name: animal, dtype: object """ result = super().drop_duplicates(keep=keep, ignore_index=ignore_index) @@ -2110,7 +2212,27 @@ def to_pandas(self, index=True, nullable=False, **kwargs): @property def data(self): """The gpu buffer for the data - """ + + Returns + ------- + out : The GPU buffer of the Series. + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([1, 2, 3, 4]) + >>> series + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + >>> series.data + + >>> series.data.to_host_array() + array([1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, + 0, 0, 4, 0, 0, 0, 0, 0, 0, 0], dtype=uint8) + """ # noqa: E501 return self._column.data @property @@ -2157,7 +2279,26 @@ def as_mask(self): Returns ------- device array - """ + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([True, False, True]) + >>> s.as_mask() + + >>> s.as_mask().to_host_array() + array([ 5, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 181, 164, + 188, 1, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, + 127, 253, 214, 62, 241, 1, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + dtype=uint8) + """ + if not pd.api.type.is_bool_dtype(self.dtype): + raise TypeError( + f"Series must of boolean dtype, found: {self.dtype}" + ) + return self._column.as_mask() def astype(self, dtype, copy=False, errors="raise"): @@ -2188,6 +2329,52 @@ def astype(self, dtype, copy=False, errors="raise"): out : Series Returns ``self.copy(deep=copy)`` if ``dtype`` is the same as ``self.dtype``. 
+ + Examples + -------- + >>> import cudf + >>> series = cudf.Series([1, 2], dtype='int32') + >>> series + 0 1 + 1 2 + dtype: int32 + >>> series.astype('int64') + 0 1 + 1 2 + dtype: int64 + + Convert to categorical type: + + >>> series.astype('category') + 0 1 + 1 2 + dtype: category + Categories (2, int64): [1, 2] + + Convert to ordered categorical type with custom ordering: + + >>> cat_dtype = cudf.CategoricalDtype(categories=[2, 1], ordered=True) + >>> series.astype(cat_dtype) + 0 1 + 1 2 + dtype: category + Categories (2, int64): [2 < 1] + + Note that using `copy=False`(enabled by default) + and changing data on a new Series will + propagate changes: + + >>> s1 = cudf.Series([1, 2]) + >>> s1 + 0 1 + 1 2 + dtype: int64 + >>> s2 = s1.astype('int64', copy=False) + >>> s2[0] = 10 + >>> s1 + 0 10 + 1 2 + dtype: int64 """ if errors not in ("ignore", "raise", "warn"): raise ValueError("invalid error value specified") @@ -2229,6 +2416,26 @@ def argsort(self, ascending=True, na_position="last"): Returns ------- result: Series + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([3, 1, 2]) + >>> s + 0 3 + 1 1 + 2 2 + dtype: int64 + >>> s.argsort() + 0 1 + 1 2 + 2 0 + dtype: int32 + >>> s[s.argsort()] + 1 1 + 2 2 + 0 3 + dtype: int64 """ return self._sort(ascending=ascending, na_position=na_position)[1] @@ -3984,7 +4191,27 @@ def scale(self): def abs(self): """Absolute value of each element of the series. - Returns a new Series. + Returns + ------- + abs + Series containing the absolute value of each element. 
+ + Examples + -------- + >>> import cudf + >>> series = cudf.Series([-1.10, 2, -3.33, 4]) + >>> series + 0 -1.10 + 1 2.00 + 2 -3.33 + 3 4.00 + dtype: float64 + >>> series.abs() + 0 1.10 + 1 2.00 + 2 3.33 + 3 4.00 + dtype: float64 """ return self._unaryop("abs") @@ -3993,10 +4220,31 @@ def __abs__(self): # Rounding def ceil(self): - """Rounds each value upward to the smallest integral value not less + """ + Rounds each value upward to the smallest integral value not less than the original. - Returns a new Series. + Returns + ------- + res + Returns a new Series with ceiling value of each element. + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([1.1, 2.8, 3.5, 4.5]) + >>> series + 0 1.1 + 1 2.8 + 2 3.5 + 3 4.5 + dtype: float64 + >>> series.ceil() + 0 2.0 + 1 3.0 + 2 4.0 + 3 5.0 + dtype: float64 """ return self._unaryop("ceil") @@ -4252,6 +4500,19 @@ def digitize(self, bins, right=False): Returns ------- A new Series containing the indices. + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([0.2, 6.4, 3.0, 1.6]) + >>> bins = cudf.Series([0.0, 1.0, 2.5, 4.0, 10.0]) + >>> inds = s.digitize(bins) + >>> inds + 0 1 + 1 4 + 2 3 + 3 2 + dtype: int32 """ return Series( cudf.core.column.numerical.digitize(self._column, bins, right) @@ -4261,10 +4522,61 @@ def diff(self, periods=1): """Calculate the difference between values at positions i and i - N in an array and store the output in a new array. + Returns + ------- + Series + First differences of the Series. + Notes ----- Diff currently only supports float and integer dtype columns with no null values. 
+ + Examples + -------- + >>> import cudf + >>> series = cudf.Series([1, 1, 2, 3, 5, 8]) + >>> series + 0 1 + 1 1 + 2 2 + 3 3 + 4 5 + 5 8 + dtype: int64 + + Difference with previous row + + >>> series.diff() + 0 + 1 0 + 2 1 + 3 1 + 4 2 + 5 3 + dtype: int64 + + Difference with 3rd previous row + + >>> series.diff(periods=3) + 0 + 1 + 2 + 3 2 + 4 4 + 5 6 + dtype: int64 + + Difference with following row + + >>> series.diff(periods=-1) + 0 0 + 1 -1 + 2 -1 + 3 -2 + 4 -3 + 5 + dtype: int64 """ if self.has_nulls: raise AssertionError( From f1787b58bbdc0c2b9a24ba8fcba856027d6e536a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 9 Mar 2021 11:41:54 -0800 Subject: [PATCH 2/6] add more examples --- python/cudf/cudf/core/dataframe.py | 15 ++ python/cudf/cudf/core/series.py | 334 ++++++++++++++++++++++++++++- 2 files changed, 344 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8849c1b55a2..f995baea3fd 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3785,6 +3785,21 @@ def label_encoding( Returns ------- a new dataframe with a new column append for the coded values. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a':[1, 2, 3], 'b':[10, 10, 20]}) + >>> df + a b + 0 1 10 + 1 2 10 + 2 3 20 + >>> df.label_encoding(column="b", prefix="b_col", cats=[10, 20]) + a b b_col_labels + 0 1 10 0 + 1 2 10 0 + 2 3 20 1 """ newname = prefix_sep.join([prefix, "labels"]) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 0c9da777bd8..c0cb3cd7705 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1,4 +1,5 @@ # Copyright (c) 2018-2021, NVIDIA CORPORATION. 
+ from __future__ import annotations import pickle @@ -79,8 +80,37 @@ def _constructor_expanddim(self): def from_categorical(cls, categorical, codes=None): """Creates from a pandas.Categorical - If ``codes`` is defined, use it instead of ``categorical.codes`` - """ + Parameters + ---------- + categorical : pandas.Categorical + Contains data stored in a pandas Categorical. + + codes : array-like, optional. + The category codes of this categorical. If ``codes`` are + defined, they are used instead of ``categorical.codes`` + + Returns + ------- + Series + A cudf categorical series. + + Examples + -------- + >>> import cudf + >>> import pandas as pd + >>> pd_categorical = pd.Categorical(pd.Series(['a', 'b', 'c', 'a'], dtype='category')) + >>> pd_categorical + ['a', 'b', 'c', 'a'] + Categories (3, object): ['a', 'b', 'c'] + >>> series = cudf.Series.from_categorical(pd_categorical) + >>> series + 0 a + 1 b + 2 c + 3 a + dtype: category + Categories (3, object): ['a', 'b', 'c'] + """ # noqa: E501 col = cudf.core.column.categorical.pandas_categorical_as_column( categorical, codes=codes ) @@ -106,6 +136,31 @@ def from_masked_array(cls, data, mask, null_count=None): null_count : int, optional The number of null values. If None, it is calculated automatically. + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> a = cudf.Series([1, 2, 3, None, 4, None]) + >>> a + 0 1 + 1 2 + 2 3 + 3 + 4 4 + 5 + dtype: int64 + >>> b = cudf.Series([10, 11, 12, 13, 14]) + >>> cudf.Series.from_masked_array(data=b, mask=a._column.mask) + 0 10 + 1 11 + 2 12 + 3 + 4 14 + dtype: int64 """ col = column.as_column(data).set_mask(mask) return cls(data=col) @@ -1300,7 +1355,7 @@ def rsub(self, other, fill_value=None, axis=0): def __rsub__(self, other): return self._binaryop(other, "sub", reflect=True) - def mul(self, other, fill_value=None, axis=0): + def multiply(self, other, fill_value=None, axis=0): """Multiplication of series and other, element-wise (binary operator mul). 
@@ -1310,11 +1365,43 @@ def mul(self, other, fill_value=None, axis=0): fill_value : None or value Value to fill nulls with before computation. If data in both corresponding Series locations is null the result will be null + + Returns + ------- + Series + The result of the operation. + + Examples + -------- + >>> import cudf + >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd']) + >>> a + a 1 + b 2 + c 3 + d + dtype: int64 + >>> b = cudf.Series([1, None, 2, None], index=['a', 'b', 'd', 'e']) + >>> b + a 1 + b + d 2 + e + dtype: int64 + >>> a.multiply(b, fill_value=0) + a 1 + b 0 + c 0 + d 0 + e + dtype: int64 """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "mul", fill_value=fill_value) + mul = multiply + def __mul__(self, other): return self._binaryop(other, "mul") @@ -1346,6 +1433,26 @@ def mod(self, other, fill_value=None, axis=0): fill_value : None or value Value to fill nulls with before computation. If data in both corresponding Series locations is null the result will be null + + Returns + ------- + Series + The result of the operation. + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([10, 20, 30]) + >>> series + 0 10 + 1 20 + 2 30 + dtype: int64 + >>> series.mod(4) + 0 2 + 1 0 + 2 2 + dtype: int64 """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") @@ -1418,6 +1525,36 @@ def floordiv(self, other, fill_value=None, axis=0): fill_value : None or value Value to fill nulls with before computation. If data in both corresponding Series locations is null the result will be null + + Returns + ------- + Series + The result of the operation. 
+ + Examples + -------- + >>> import cudf + >>> a = cudf.Series([1, 1, 1, None], index=['a', 'b', 'c', 'd']) + >>> a + a 1 + b 1 + c 1 + d + dtype: int64 + >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e']) + >>> b + a 1 + b + d 1 + e + dtype: int64 + >>> a.floordiv(b) + a 1 + b + c + d + e + dtype: int64 """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") @@ -1821,6 +1958,22 @@ def has_nulls(self): out : bool If Series has atleast one null value, return True, if not return False. + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([1, 2, None, 3, 4]) + >>> series + 0 1 + 1 2 + 2 + 3 3 + 4 4 + dtype: int64 + >>> series.has_nulls + True + >>> series.dropna().has_nulls + False """ return self._column.has_nulls @@ -2022,6 +2175,30 @@ def to_array(self, fillna=None): def nans_to_nulls(self): """ Convert nans (if any) to nulls + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> series = cudf.Series([1, 2, np.nan, None, 10], nan_as_null=False) + >>> series + 0 1.0 + 1 2.0 + 2 NaN + 3 + 4 10.0 + dtype: float64 + >>> series.nans_to_nulls() + 0 1.0 + 1 2.0 + 2 + 3 + 4 10.0 + dtype: float64 """ result_col = self._column.nans_to_nulls() return self._copy_construct(data=result_col) @@ -2253,6 +2430,18 @@ def loc(self): See also -------- cudf.core.dataframe.DataFrame.loc + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([10, 11, 12], index=['a', 'b', 'c']) + >>> series + a 10 + b 11 + c 12 + dtype: int64 + >>> series.loc['b'] + 11 """ return _SeriesLocIndexer(self) @@ -2264,6 +2453,18 @@ def iloc(self): See also -------- cudf.core.dataframe.DataFrame.iloc + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([10, 20, 30]) + >>> s + 0 10 + 1 20 + 2 30 + dtype: int64 + >>> s.iloc[2] + 30 """ return _SeriesIlocIndexer(self) @@ -2294,7 +2495,7 @@ def as_mask(self): 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=uint8) """ - if not 
pd.api.type.is_bool_dtype(self.dtype): + if not pd.api.types.is_bool_dtype(self.dtype): raise TypeError( f"Series must of boolean dtype, found: {self.dtype}" ) @@ -2523,6 +2724,62 @@ def _n_largest_or_smallest(self, largest, n, keep): def nlargest(self, n=5, keep="first"): """Returns a new Series of the *n* largest element. + + Parameters + ---------- + n : int, default 5 + Return this many descending sorted values. + keep : {'first', 'last'}, default 'first' + When there are duplicate values that cannot all fit in a + Series of `n` elements: + - ``first`` : return the first `n` occurrences in order + of appearance. + - ``last`` : return the last `n` occurrences in reverse + order of appearance. + + Returns + ------- + Series + The `n` largest values in the Series, sorted in decreasing order. + + Examples + -------- + >>> import cudf + >>> countries_population = {"Italy": 59000000, "France": 65000000, + ... "Malta": 434000, "Maldives": 434000, + ... "Brunei": 434000, "Iceland": 337000, + ... "Nauru": 11300, "Tuvalu": 11300, + ... 
"Anguilla": 11300, "Montserrat": 5200} + >>> series = cudf.Series(countries_population) + >>> series + Italy 59000000 + France 65000000 + Malta 434000 + Maldives 434000 + Brunei 434000 + Iceland 337000 + Nauru 11300 + Tuvalu 11300 + Anguilla 11300 + Montserrat 5200 + dtype: int64 + >>> series.nlargest() + France 65000000 + Italy 59000000 + Malta 434000 + Maldives 434000 + Brunei 434000 + dtype: int64 + >>> series.nlargest(3) + France 65000000 + Italy 59000000 + Malta 434000 + dtype: int64 + >>> series.nlargest(3, keep='last') + France 65000000 + Italy 59000000 + Brunei 434000 + dtype: int64 """ return self._n_largest_or_smallest(n=n, keep=keep, largest=True) @@ -3854,6 +4111,13 @@ def kurtosis( ----- Parameters currently not supported are `axis`, `level` and `numeric_only` + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([1, 2, 3, 4]) + >>> series.kurtosis() + -1.1999999999999904 """ if axis not in (None, 0): raise NotImplementedError("axis parameter is not implemented yet") @@ -4252,12 +4516,53 @@ def floor(self): """Rounds each value downward to the largest integral value not greater than the original. - Returns a new Series. + Returns + ------- + res + Returns a new Series with floor of each element. + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([-1.9, 2, 0.2, 1.5, 0.0, 3.0]) + >>> series + 0 -1.9 + 1 2.0 + 2 0.2 + 3 1.5 + 4 0.0 + 5 3.0 + dtype: float64 + >>> series.floor() + 0 -2.0 + 1 2.0 + 2 0.0 + 3 1.0 + 4 0.0 + 5 3.0 + dtype: float64 """ return self._unaryop("floor") def hash_values(self): """Compute the hash of values in this column. + + Returns + ------- + cupy array + A cupy array with hash values. 
+ + Examples + -------- + >>> import cudf + >>> series = cudf.Series([10, 120, 30]) + >>> series + 0 10 + 1 120 + 2 30 + dtype: int64 + >>> series.hash_values() + array([-1930516747, 422619251, -941520876], dtype=int32) """ return Series(self._hash()).values @@ -4278,6 +4583,25 @@ def hash_encode(self, stop, use_name=False): ------- result : Series The encoded Series. + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([10, 120, 30]) + >>> series.hash_encode(stop=200) + 0 53 + 1 51 + 2 124 + dtype: int32 + + You can choose to include name while hash + encoding by specifying `use_name=True` + + >>> series.hash_encode(stop=200, use_name=True) + 0 131 + 1 29 + 2 76 + dtype: int32 """ assert stop > 0 From a704d57848d86ce7a841a0e351aeccd792606000 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 9 Mar 2021 14:36:53 -0800 Subject: [PATCH 3/6] add more examples --- python/cudf/cudf/core/dataframe.py | 158 ++++++++++- python/cudf/cudf/core/index.py | 16 +- python/cudf/cudf/core/series.py | 439 ++++++++++++++++++++++++++++- 3 files changed, 606 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f995baea3fd..02cb061f4ed 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3561,6 +3561,32 @@ def rename( Rename will not overwite column names. If a list with duplicates is passed, column names will be postfixed with a number. 
+ + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) + >>> df + A B + 0 1 4 + 1 2 5 + 2 3 6 + + Rename columns using a mapping: + + >>> df.rename(columns={"A": "a", "B": "c"}) + a c + 0 1 4 + 1 2 5 + 2 3 6 + + Rename index using a mapping: + + >>> df.rename(index={0: 10, 1: 20, 2: 30}) + A B + 10 1 4 + 20 2 5 + 30 3 6 """ if errors != "ignore": raise NotImplementedError( @@ -4129,20 +4155,129 @@ def agg(self, aggs, axis=None): def nlargest(self, n, columns, keep="first"): """Get the rows of the DataFrame sorted by the n largest value of *columns* + Parameters + ---------- + n : int + Number of rows to return. + columns : label or list of labels + Column label(s) to order by. + keep : {'first', 'last'}, default 'first' + Where there are duplicate values: + - `first` : prioritize the first occurrence(s) + - `last` : prioritize the last occurrence(s) + + Returns + ------- + DataFrame + The first `n` rows ordered by the given columns in descending + order. + Notes ----- Difference from pandas: - Only a single column is supported in *columns* + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'population': [59000000, 65000000, 434000, + ... 434000, 434000, 337000, 11300, + ... 11300, 11300], + ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, + ... 17036, 182, 38, 311], + ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", + ... "IS", "NR", "TV", "AI"]}, + ... index=["Italy", "France", "Malta", + ... "Maldives", "Brunei", "Iceland", + ... 
"Nauru", "Tuvalu", "Anguilla"]) + >>> df + population GDP alpha-2 + Italy 59000000 1937894 IT + France 65000000 2583560 FR + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + Iceland 337000 17036 IS + Nauru 11300 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + >>> df.nlargest(3, 'population') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Malta 434000 12011 MT + >>> df.nlargest(3, 'population', keep='last') + population GDP alpha-2 + France 65000000 2583560 FR + Italy 59000000 1937894 IT + Brunei 434000 12128 BN """ return self._n_largest_or_smallest("nlargest", n, columns, keep) def nsmallest(self, n, columns, keep="first"): """Get the rows of the DataFrame sorted by the n smallest value of *columns* + Parameters + ---------- + n : int + Number of items to retrieve. + columns : list or str + Column name or names to order by. + keep : {'first', 'last'}, default 'first' + Where there are duplicate values: + - ``first`` : take the first occurrence. + - ``last`` : take the last occurrence. + + Returns + ------- + DataFrame + Notes ----- Difference from pandas: - Only a single column is supported in *columns* + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'population': [59000000, 65000000, 434000, + ... 434000, 434000, 337000, 337000, + ... 11300, 11300], + ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, + ... 17036, 182, 38, 311], + ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", + ... "IS", "NR", "TV", "AI"]}, + ... index=["Italy", "France", "Malta", + ... "Maldives", "Brunei", "Iceland", + ... 
"Nauru", "Tuvalu", "Anguilla"]) + >>> df + population GDP alpha-2 + Italy 59000000 1937894 IT + France 65000000 2583560 FR + Malta 434000 12011 MT + Maldives 434000 4520 MV + Brunei 434000 12128 BN + Iceland 337000 17036 IS + Nauru 337000 182 NR + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + + In the following example, we will use ``nsmallest`` to select the + three rows having the smallest values in column "population". + + >>> df.nsmallest(3, 'population') + population GDP alpha-2 + Tuvalu 11300 38 TV + Anguilla 11300 311 AI + Iceland 337000 17036 IS + + When using ``keep='last'``, ties are resolved in reverse order: + + >>> df.nsmallest(3, 'population', keep='last') + population GDP alpha-2 + Anguilla 11300 311 AI + Tuvalu 11300 38 TV + Nauru 337000 182 NR """ return self._n_largest_or_smallest("nsmallest", n, columns, keep) @@ -5745,7 +5880,28 @@ def quantile( non-numeric types and result is expected to be a Series in case of Pandas. cuDF will return a DataFrame as it doesn't support mixed types under Series. - """ + + Examples + -------- + >>> import cupy as cp + >>> import cudf + >>> df = cudf.DataFrame(cp.array([[1, 1], [2, 10], [3, 100], [4, 100]]), + ... columns=['a', 'b']) + >>> df + a b + 0 1 1 + 1 2 10 + 2 3 100 + 3 4 100 + >>> df.quantile(0.1) + a 1.3 + b 3.7 + Name: 0.1, dtype: float64 + >>> df.quantile([.1, .5]) + a b + 0.1 1.3 3.7 + 0.5 2.5 55.0 + """ # noqa: E501 if axis not in (0, None): raise NotImplementedError("axis is not implemented yet") diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index f3b07a40acb..2a5d2647e95 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1,4 +1,5 @@ -# Copyright (c) 2018-2020, NVIDIA CORPORATION. +# Copyright (c) 2018-2021, NVIDIA CORPORATION. 
+ from __future__ import annotations, division, print_function import pickle @@ -1178,6 +1179,19 @@ def rename(self, name, inplace=False): ------- Index + Examples + -------- + >>> import cudf + >>> index = cudf.Index([1, 2, 3], name='one') + >>> index + Int64Index([1, 2, 3], dtype='int64', name='one') + >>> index.name + 'one' + >>> renamed_index = index.rename('two') + >>> renamed_index + Int64Index([1, 2, 3], dtype='int64', name='two') + >>> renamed_index.name + 'two' """ if inplace is True: self.name = name diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c0cb3cd7705..b0ebbb6bbbe 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -650,13 +650,74 @@ def reindex(self, index=None, copy=True): Returns ------- A new Series that conforms to the supplied index + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd']) + >>> series + a 10 + b 20 + c 30 + d 40 + dtype: int64 + >>> series.reindex(['a', 'b', 'y', 'z']) + a 10 + b 20 + y + z + dtype: int64 """ name = self.name or 0 idx = self._index if index is None else index - return self.to_frame(name).reindex(idx, copy=copy)[name] + series = self.to_frame(name).reindex(idx, copy=copy)[name] + series.name = self.name + return series def reset_index(self, drop=False, inplace=False): - """ Reset index to RangeIndex """ + """ + Reset index to RangeIndex + + Parameters + ---------- + drop : bool, default False + Just reset the index, without inserting it as a column in + the new DataFrame. + inplace : bool, default False + Modify the Series in place (do not create a new object). + + Returns + ------- + Series or DataFrame or None + When `drop` is False (the default), a DataFrame is returned. + The newly created columns will come first in the DataFrame, + followed by the original Series values. + When `drop` is True, a `Series` is returned. + In either case, if ``inplace=True``, no value is returned. 
+ + Examples + -------- + >>> import cudf + >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[10, 11, 12, 13]) + >>> series + 10 a + 11 b + 12 c + 13 d + dtype: object + >>> series.reset_index() + index 0 + 0 10 a + 1 11 b + 2 12 c + 3 13 d + >>> series.reset_index(drop=True) + 0 a + 1 b + 2 c + 3 d + dtype: object + """ if not drop: if inplace is True: raise TypeError( @@ -1309,6 +1370,36 @@ def radd(self, other, fill_value=None, axis=0): fill_value : None or value Value to fill nulls with before computation. If data in both corresponding Series locations is null the result will be null + + Returns + ------- + Series + The result of the operation. + + Examples + -------- + >>> import cudf + >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd']) + >>> a + a 1 + b 2 + c 3 + d + dtype: int64 + >>> b = cudf.Series([1, None, 1, None], index=['a', 'b', 'd', 'e']) + >>> b + a 1 + b + d 1 + e + dtype: int64 + >>> a.radd(b, fill_value=0) + a 2 + b 2 + c 3 + d 1 + e + dtype: int64 """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") @@ -1347,6 +1438,36 @@ def rsub(self, other, fill_value=None, axis=0): fill_value : None or value Value to fill nulls with before computation. If data in both corresponding Series locations is null the result will be null + + Returns + ------- + Series + The result of the operation. + + Examples + -------- + >>> import cudf + >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd']) + >>> a + a 1 + b 2 + c 3 + d + dtype: int64 + >>> b = cudf.Series([1, None, 2, None], index=['a', 'b', 'd', 'e']) + >>> b + a 1 + b + d 2 + e + dtype: int64 + >>> a.rsub(b, fill_value=10) + a 0 + b 8 + c 7 + d -8 + e + dtype: int64 """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") @@ -1415,7 +1536,40 @@ def rmul(self, other, fill_value=None, axis=0): fill_value : None or value Value to fill nulls with before computation. 
If data in both corresponding Series locations is null the result will be null - """ + + Returns + ------- + Series + The result of the operation. + + Examples + -------- + >>> import cudf + >>> a = cudf.Series([10, 20, None, 30, 40], index=['a', 'b', 'c', 'd', 'e']) + >>> a + a 10 + b 20 + c + d 30 + e 40 + dtype: int64 + >>> b = cudf.Series([None, 1, 20, 5, 4], index=['a', 'b', 'd', 'e', 'f']) + >>> b + a + b 1 + d 20 + e 5 + f 4 + dtype: int64 + >>> a.rmul(b, fill_value=2) + a 20 + b 20 + c + d 600 + e 200 + f 8 + dtype: int64 + """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "mul", fill_value, True) @@ -1471,7 +1625,40 @@ def rmod(self, other, fill_value=None, axis=0): fill_value : None or value Value to fill nulls with before computation. If data in both corresponding Series locations is null the result will be null - """ + + Returns + ------- + Series + The result of the operation. + + Examples + -------- + >>> import cudf + >>> a = cudf.Series([10, 20, None, 30, 40], index=['a', 'b', 'c', 'd', 'e']) + >>> a + a 10 + b 20 + c + d 30 + e 40 + dtype: int64 + >>> b = cudf.Series([None, 1, 20, 5, 4], index=['a', 'b', 'd', 'e', 'f']) + >>> b + a + b 1 + d 20 + e 5 + f 4 + dtype: int64 + >>> a.rmod(b, fill_value=10) + a 0 + b 1 + c + d 20 + e 5 + f 4 + dtype: int64 + """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "mod", fill_value, True) @@ -1489,6 +1676,36 @@ def pow(self, other, fill_value=None, axis=0): fill_value : None or value Value to fill nulls with before computation. If data in both corresponding Series locations is null the result will be null + + Returns + ------- + Series + The result of the operation. 
+ + Examples + -------- + >>> import cudf + >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd']) + >>> a + a 1 + b 2 + c 3 + d + dtype: int64 + >>> b = cudf.Series([10, None, 12, None], index=['a', 'b', 'd', 'e']) + >>> b + a 10 + b + d 12 + e + dtype: int64 + >>> a.pow(b, fill_value=0) + a 1 + b 1 + c 1 + d 0 + e + dtype: int64 """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") @@ -1507,6 +1724,36 @@ def rpow(self, other, fill_value=None, axis=0): fill_value : None or value Value to fill nulls with before computation. If data in both corresponding Series locations is null the result will be null + + Returns + ------- + Series + The result of the operation. + + Examples + -------- + >>> import cudf + >>> a = cudf.Series([1, 2, 3, None], index=['a', 'b', 'c', 'd']) + >>> a + a 1 + b 2 + c 3 + d + dtype: int64 + >>> b = cudf.Series([10, None, 12, None], index=['a', 'b', 'd', 'e']) + >>> b + a 10 + b + d 12 + e + dtype: int64 + >>> a.rpow(b, fill_value=0) + a 10 + b 0 + c 0 + d 1 + e + dtype: int64 """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") @@ -2785,6 +3032,74 @@ def nlargest(self, n=5, keep="first"): def nsmallest(self, n=5, keep="first"): """Returns a new Series of the *n* smallest element. + + Parameters + ---------- + n : int, default 5 + Return this many ascending sorted values. + keep : {'first', 'last'}, default 'first' + When there are duplicate values that cannot all fit in a + Series of `n` elements: + - ``first`` : return the first `n` occurrences in order + of appearance. + - ``last`` : return the last `n` occurrences in reverse + order of appearance. + + Returns + ------- + Series + The `n` smallest values in the Series, sorted in increasing order. + + Examples + -------- + >>> import cudf + >>> countries_population = {"Italy": 59000000, "France": 65000000, + ... "Brunei": 434000, "Malta": 434000, + ... "Maldives": 434000, "Iceland": 337000, + ... 
"Nauru": 11300, "Tuvalu": 11300, + ... "Anguilla": 11300, "Montserrat": 5200} + >>> s = cudf.Series(countries_population) + >>> s + Italy 59000000 + France 65000000 + Brunei 434000 + Malta 434000 + Maldives 434000 + Iceland 337000 + Nauru 11300 + Tuvalu 11300 + Anguilla 11300 + Montserrat 5200 + dtype: int64 + + The `n` smallest elements where ``n=5`` by default. + + >>> s.nsmallest() + Montserrat 5200 + Nauru 11300 + Tuvalu 11300 + Anguilla 11300 + Iceland 337000 + dtype: int64 + + The `n` smallest elements where ``n=3``. Default `keep` value is + 'first' so Nauru and Tuvalu will be kept. + + >>> s.nsmallest(3) + Montserrat 5200 + Nauru 11300 + Tuvalu 11300 + dtype: int64 + + The `n` smallest elements where ``n=3`` and keeping the last + duplicates. Anguilla and Tuvalu will be kept since they are the last + with value 11300 based on the index order. + + >>> s.nsmallest(3, keep='last') + Montserrat 5200 + Anguilla 11300 + Tuvalu 11300 + dtype: int64 """ return self._n_largest_or_smallest(n=n, keep=keep, largest=False) @@ -2969,7 +3284,34 @@ def replace( return self._mimic_inplace(result, inplace=inplace) def reverse(self): - """Reverse the Series + """ + Reverse the Series + + Returns + ------- + Series + A reversed Series. + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([1, 2, 3, 4, 5, 6]) + >>> series + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + 5 6 + dtype: int64 + >>> series.reverse() + 5 6 + 4 5 + 3 4 + 2 3 + 1 2 + 0 1 + dtype: int64 """ rinds = column.arange((self._column.size - 1), -1, -1, dtype=np.int32) col = self._column[rinds] @@ -2991,6 +3333,31 @@ def one_hot_encoding(self, cats, dtype="float64"): Sequence A sequence of new series for each category. Its length is determined by the length of ``cats``. 
+ + Examples + -------- + >>> import cudf + >>> s = cudf.Series(['a', 'b', 'c', 'a']) + >>> s + 0 a + 1 b + 2 c + 3 a + dtype: object + >>> s.one_hot_encoding(['a', 'c', 'b']) + [0 1.0 + 1 0.0 + 2 0.0 + 3 1.0 + dtype: float64, 0 0.0 + 1 0.0 + 2 1.0 + 3 0.0 + dtype: float64, 0 0.0 + 1 1.0 + 2 0.0 + 3 0.0 + dtype: float64] """ if hasattr(cats, "to_arrow"): cats = cats.to_pandas() @@ -4312,6 +4679,31 @@ def unique(self): def nunique(self, method="sort", dropna=True): """Returns the number of unique values of the Series: approximate version, and exact version to be moved to libgdf + + Excludes NA values by default. + + Parameters + ---------- + dropna : bool, default True + Don't include NA values in the count. + + Returns + ------- + int + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([1, 3, 5, 7, 7]) + >>> s + 0 1 + 1 3 + 2 5 + 3 7 + 4 7 + dtype: int64 + >>> s.nunique() + 4 """ if method != "sort": msg = "non sort based distinct_count() not implemented yet" @@ -4641,6 +5033,24 @@ def quantile( If ``q`` is an array, a Series will be returned where the index is ``q`` and the values are the quantiles, otherwise a float will be returned. 
+ + Examples + -------- + >>> import cudf + >>> series = cudf.Series([1, 2, 3, 4]) + >>> series + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + >>> series.quantile(0.5) + 2.5 + >>> series.quantile([0.25, 0.5, 0.75]) + 0.25 1.75 + 0.50 2.50 + 0.75 3.25 + dtype: float64 """ result = self._column.quantile(q, interpolation, exact) @@ -5024,6 +5434,25 @@ def rename(self, index=None, copy=True): Difference from pandas: - Supports scalar values only for changing name attribute - Not supporting : inplace, level + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([10, 20, 30]) + >>> series + 0 10 + 1 20 + 2 30 + dtype: int64 + >>> series.name + >>> renamed_series = series.rename('numeric_series') + >>> renamed_series + 0 10 + 1 20 + 2 30 + Name: numeric_series, dtype: int64 + >>> renamed_series.name + 'numeric_series' """ out = self.copy(deep=False) out = out.set_index(self.index) From ddfefd26bd0f660dd75d470e0712e8397bf4264d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 9 Mar 2021 17:59:23 -0800 Subject: [PATCH 4/6] add remaining docs --- python/cudf/cudf/core/series.py | 420 ++++++++++++++++++++++++++++++-- 1 file changed, 402 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index b0ebbb6bbbe..019553ffc13 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -738,6 +738,30 @@ def set_index(self, index): ---------- index : Index, Series-convertible the new index or values for the new index + + Returns + ------- + Series + A new Series with assigned index. 
+ + Examples + -------- + >>> import cudf + >>> series = cudf.Series([10, 11, 12, 13, 14]) + >>> series + 0 10 + 1 11 + 2 12 + 3 13 + 4 14 + dtype: int64 + >>> series.set_index(['a', 'b', 'c', 'd', 'e']) + a 10 + b 11 + c 12 + d 13 + e 14 + dtype: int64 """ index = index if isinstance(index, Index) else as_index(index) return self._copy_construct(index=index) @@ -773,7 +797,26 @@ def to_frame(self, name=None): ------- DataFrame cudf DataFrame - """ + + Examples + -------- + >>> import cudf + >>> series = cudf.Series(['a', 'b', 'c', None, 'd'], name='sample', index=[10, 11, 12, 13, 15]) + >>> series + 10 a + 11 b + 12 c + 13 + 15 d + Name: sample, dtype: object + >>> series.to_frame() + sample + 10 a + 11 b + 12 c + 13 + 15 d + """ # noqa: E501 if name is not None: col = name @@ -800,6 +843,38 @@ def set_mask(self, mask, null_count=None): null_count : int, optional The number of null values. If None, it is calculated automatically. + + Returns + ------- + Series + A new series with the applied mask. + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([1, 2, 3, 4, 5]) + >>> ref_array = cudf.Series([10, None, 11, None, 16]) + >>> series + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + dtype: int64 + >>> ref_array + 0 10 + 1 + 2 11 + 3 + 4 16 + dtype: int64 + >>> series.set_mask(ref_array._column.mask) + 0 1 + 1 + 2 3 + 3 + 4 5 + dtype: int64 """ col = self._column.set_mask(mask) return self._copy_construct(data=col) @@ -1010,14 +1085,49 @@ def __setitem__(self, key, value): self.loc[key] = value def take(self, indices, keep_index=True): - """Return Series by taking values from the corresponding *indices*. + """ + Return Series by taking values from the corresponding *indices*. + + Parameters + ---------- + indices : array-like or scalar + An array/scalar like integers indicating which positions to take. + keep_index : bool, default True + Whether to retain the index in the result Series or not.
+ + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([10, 11, 12, 13, 14]) + >>> series + 0 10 + 1 11 + 2 12 + 3 13 + 4 14 + dtype: int64 + >>> series.take([0, 4]) + 0 10 + 4 14 + dtype: int64 + + If you want to drop the index, pass `keep_index=False` + + >>> series.take([0, 4], keep_index=False) + 0 10 + 1 14 + dtype: int64 """ if keep_index is True or is_scalar(indices): return self.iloc[indices] else: col_inds = as_column(indices) data = self._column.take(col_inds, keep_index=False) - return self._copy_construct(data=data) + return self._copy_construct(data=data, index=None) def __bool__(self): """Always raise TypeError when converting a Series @@ -1025,16 +1135,6 @@ def __bool__(self): """ raise TypeError(f"can't compute boolean for {type(self)}") - def values_to_string(self, nrows=None): - """Returns a list of string for each element. - """ - values = self[:nrows] - if self.dtype == np.dtype("object"): - out = [str(v) for v in values] - else: - out = ["" if v is None else str(v) for v in values] - return out - def tolist(self): raise TypeError( @@ -1138,6 +1238,25 @@ def to_string(self): Uses Pandas formatting internals to produce output identical to Pandas. Use the Pandas formatting settings directly in Pandas to control cuDF output. + + Returns + ------- + str + String representation of Series + + Examples + -------- + >>> import cudf + >>> series = cudf.Series(['a', None, 'b', 'c', None]) + >>> series + 0 a + 1 + 2 b + 3 c + 4 + dtype: object + >>> series.to_string() + '0 a\n1 \n2 b\n3 c\n4 \ndtype: object' """ return self.__repr__() @@ -1410,7 +1529,7 @@ def radd(self, other, fill_value=None, axis=0): def __radd__(self, other): return self._binaryop(other, "add", reflect=True) - def sub(self, other, fill_value=None, axis=0): + def subtract(self, other, fill_value=None, axis=0): """Subtraction of series and other, element-wise (binary operator sub). 
@@ -1420,11 +1539,44 @@ def sub(self, other, fill_value=None, axis=0): fill_value : None or value Value to fill nulls with before computation. If data in both corresponding Series locations is null the result will be null - """ + + Returns + ------- + Series + The result of the operation. + + Examples + -------- + >>> import cudf + >>> a = cudf.Series([10, 20, None, 30, None], index=['a', 'b', 'c', 'd', 'e']) + >>> a + a 10 + b 20 + c + d 30 + e + dtype: int64 + >>> b = cudf.Series([1, None, 2, 30], index=['a', 'c', 'b', 'd']) + >>> b + a 1 + c + b 2 + d 30 + dtype: int64 + >>> a.subtract(b, fill_value=2) + a 9 + b 18 + c + d 0 + e + dtype: int64 + """ # noqa: E501 if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") return self._binaryop(other, "sub", fill_value) + sub = subtract + def __sub__(self, other): return self._binaryop(other, "sub") @@ -1876,6 +2028,36 @@ def truediv(self, other, fill_value=None, axis=0): fill_value : None or value Value to fill nulls with before computation. If data in both corresponding Series locations is null the result will be null + + Returns + ------- + Series + The result of the operation. + + Examples + -------- + >>> import cudf + >>> a = cudf.Series([1, 10, 20, None], index=['a', 'b', 'c', 'd']) + >>> a + a 1 + b 10 + c 20 + d + dtype: int64 + >>> b = cudf.Series([1, None, 2, None], index=['a', 'b', 'd', 'e']) + >>> b + a 1 + b + d 2 + e + dtype: int64 + >>> a.truediv(b, fill_value=0) + a 1.0 + b Inf + c Inf + d 0.0 + e + dtype: float64 """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") @@ -1894,6 +2076,36 @@ def rtruediv(self, other, fill_value=None, axis=0): fill_value : None or value Value to fill nulls with before computation. If data in both corresponding Series locations is null the result will be null + + Returns + ------- + Series + The result of the operation.
+ + Examples + -------- + >>> import cudf + >>> a = cudf.Series([10, 20, None, 30], index=['a', 'b', 'c', 'd']) + >>> a + a 10 + b 20 + c <NA> + d 30 + dtype: int64 + >>> b = cudf.Series([1, None, 2, 3], index=['a', 'b', 'd', 'e']) + >>> b + a 1 + b <NA> + d 2 + e 3 + dtype: int64 + >>> a.rtruediv(b, fill_value=0) + a 0.1 + b 0.0 + c <NA> + d 0.066666667 + e Inf + dtype: float64 """ if axis != 0: raise NotImplementedError("Only axis=0 supported at this time.") @@ -2411,11 +2623,32 @@ def to_array(self, fillna=None): If it equals "pandas", null values are filled with NaNs. Non integral dtype is promoted to np.float64. + Returns + ------- + numpy.ndarray + A numpy array representation of the elements in the Series. + Notes ----- - If ``fillna`` is ``None``, null values are skipped. Therefore, the output size could be smaller. + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([10, 11, 12, 13, 14]) + >>> series + 0 10 + 1 11 + 2 12 + 3 13 + 4 14 + dtype: int64 + >>> array = series.to_array() + >>> array + array([10, 11, 12, 13, 14]) + >>> type(array) + <class 'numpy.ndarray'> """ return self._column.to_array(fillna=fillna) @@ -2566,6 +2799,24 @@ def to_gpu_array(self, fillna=None): if ``fillna`` is ``None``, null values are skipped. Therefore, the output size could be smaller. + + Returns + ------- + numba DeviceNDArray + + Examples + -------- + >>> import cudf + >>> s = cudf.Series([10, 20, 30, 40, 50]) + >>> s + 0 10 + 1 20 + 2 30 + 3 40 + 4 50 + dtype: int64 + >>> s.to_gpu_array() + <numba.cuda.cudadrv.devicearray.DeviceNDArray object at 0x7f1840858890> """ return self._column.to_gpu_array(fillna=fillna) @@ -2888,7 +3139,44 @@ def argsort(self, ascending=True, na_position="last"): return self._sort(ascending=ascending, na_position=na_position)[1] def sort_index(self, ascending=True): - """Sort by the index. + """ + Sort by the index. + + Parameters + ---------- + ascending : bool, default True + Sort ascending vs. descending. + + Returns + ------- + Series + The original Series sorted by the labels.
+ + Examples + -------- + >>> import cudf + >>> series = cudf.Series(['a', 'b', 'c', 'd'], index=[3, 2, 1, 4]) + >>> series + 3 a + 2 b + 1 c + 4 d + dtype: object + >>> series.sort_index() + 1 c + 2 b + 3 a + 4 d + dtype: object + + Sort Descending + + >>> series.sort_index(ascending=False) + 4 d + 3 a + 2 b + 1 c + dtype: object """ inds = self.index.argsort(ascending=ascending) return self.take(inds) @@ -4238,6 +4526,22 @@ def std( ----- Parameters currently not supported are `axis`, `level` and `numeric_only` + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([10, 10, 20, 30, 40]) + >>> series + 0 10 + 1 10 + 2 20 + 3 30 + 4 40 + dtype: int64 + >>> series.std() + 13.038404810405298 + >>> series.std(ddof=2) + 15.05545305418162 """ if axis not in (None, 0): @@ -4287,6 +4591,20 @@ def var( ----- Parameters currently not supported are `axis`, `level` and `numeric_only` + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([10, 11, 12, 0, 1]) + >>> series + 0 10 + 1 11 + 2 12 + 3 0 + 4 1 + dtype: int64 + >>> series.var() + 33.7 """ if axis not in (None, 0): @@ -4521,6 +4839,22 @@ def skew( ----- Parameters currently not supported are `axis`, `level` and `numeric_only` + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([1, 2, 3, 4, 5, 6, 6]) + >>> series + 0 1 + 1 2 + 2 3 + 3 4 + 4 5 + 5 6 + 6 6 + dtype: int64 + >>> series.skew() + -0.288195490292614 """ if axis not in (None, 0): @@ -4672,6 +5006,31 @@ def isin(self, values): def unique(self): """ Returns unique values of this Series. + + Returns + ------- + Series + A series with only the unique values. 
+ + Examples + -------- + >>> import cudf + >>> series = cudf.Series(['a', 'a', 'b', None, 'b', None, 'c']) + >>> series + 0 a + 1 a + 2 b + 3 + 4 b + 5 + 6 c + dtype: object + >>> series.unique() + 0 + 1 a + 2 b + 3 c + dtype: object """ res = self._column.unique() return Series(res, name=self.name) @@ -4836,7 +5195,32 @@ def value_counts( return res def scale(self): - """Scale values to [0, 1] in float64 + """ + Scale values to [0, 1] in float64 + + Returns + ------- + Series + A new series with values scaled to [0, 1]. + + Examples + -------- + >>> import cudf + >>> series = cudf.Series([10, 11, 12, 0.5, 1]) + >>> series + 0 10.0 + 1 11.0 + 2 12.0 + 3 0.5 + 4 1.0 + dtype: float64 + >>> series.scale() + 0 0.826087 + 1 0.913043 + 2 1.000000 + 3 0.000000 + 4 0.043478 + dtype: float64 """ vmin = self.min() vmax = self.max() From 55321f021d6b9a0f8957b46d61282bd795591ee0 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 9 Mar 2021 19:13:03 -0800 Subject: [PATCH 5/6] bring more test coverage for series.take --- python/cudf/cudf/tests/test_indexing.py | 26 ++++++++++++------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py index 73a074c0376..15d504799e4 100644 --- a/python/cudf/cudf/tests/test_indexing.py +++ b/python/cudf/cudf/tests/test_indexing.py @@ -709,20 +709,18 @@ def test_series_take(ntake, keep_index): np.random.seed(0) nelem = 123 - data = np.random.randint(0, 20, nelem) - sr = cudf.Series(data) - - take_indices = np.random.randint(0, len(sr), ntake) - - if keep_index is True: - out = sr.take(take_indices) - np.testing.assert_array_equal(out.to_array(), data[take_indices]) - elif keep_index is False: - out = sr.take(take_indices, keep_index=False) - np.testing.assert_array_equal(out.to_array(), data[take_indices]) - np.testing.assert_array_equal( - out.index.to_array(), sr.index.to_array() - ) + psr = pd.Series(np.random.randint(0, 20, nelem)) + gsr = 
cudf.Series(psr) + + take_indices = np.random.randint(0, len(gsr), ntake) + + actual = gsr.take(take_indices, keep_index=keep_index) + expected = psr.take(take_indices) + + if not keep_index: + expected = expected.reset_index(drop=True) + + assert_eq(actual, expected) def test_series_take_positional(): From 2839b7c71b41ac3f7ed3d7197c08644b6ad4daaf Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 9 Mar 2021 19:49:01 -0800 Subject: [PATCH 6/6] minor alignment fixes --- python/cudf/cudf/core/dataframe.py | 2 ++ python/cudf/cudf/core/series.py | 25 +++++++++++++++---------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 02cb061f4ed..5ab058ff495 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4163,6 +4163,7 @@ def nlargest(self, n, columns, keep="first"): Column label(s) to order by. keep : {'first', 'last'}, default 'first' Where there are duplicate values: + - `first` : prioritize the first occurrence(s) - `last` : prioritize the last occurrence(s) @@ -4225,6 +4226,7 @@ def nsmallest(self, n, columns, keep="first"): Column name or names to order by. keep : {'first', 'last'}, default 'first' Where there are duplicate values: + - ``first`` : take the first occurrence. - ``last`` : take the last occurrence. 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 019553ffc13..2a990eef32e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1256,8 +1256,8 @@ def to_string(self): 4 dtype: object >>> series.to_string() - '0 a\n1 \n2 b\n3 c\n4 \ndtype: object' - """ + '0 a\\n1 \\n2 b\\n3 c\\n4 \\ndtype: object' + """ # noqa : E501 return self.__repr__() def __str__(self): @@ -2523,6 +2523,7 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): ---------- keep : {'first', 'last', ``False``}, default 'first' Method to handle dropping duplicates: + - 'first' : Drop duplicates except for the first occurrence. - 'last' : Drop duplicates except for the last occurrence. - ``False`` : Drop all duplicates. @@ -3017,11 +3018,12 @@ def astype(self, dtype, copy=False, errors="raise"): values then may propagate to other cudf objects. errors : {'raise', 'ignore', 'warn'}, default 'raise' Control raising of exceptions on invalid data for provided dtype. + - ``raise`` : allow exceptions to be raised - ``ignore`` : suppress exceptions. On error return original - object. + object. - ``warn`` : prints last exceptions as warnings and - return original object. + return original object. Returns ------- @@ -3059,7 +3061,7 @@ def astype(self, dtype, copy=False, errors="raise"): dtype: category Categories (2, int64): [2 < 1] - Note that using `copy=False`(enabled by default) + Note that using ``copy=False`` (enabled by default) and changing data on a new Series will propagate changes: @@ -3267,10 +3269,11 @@ def nlargest(self, n=5, keep="first"): keep : {'first', 'last'}, default 'first' When there are duplicate values that cannot all fit in a Series of `n` elements: + - ``first`` : return the first `n` occurrences in order - of appearance. + of appearance. - ``last`` : return the last `n` occurrences in reverse - order of appearance. + order of appearance. 
Returns ------- @@ -3319,7 +3322,8 @@ def nlargest(self, n=5, keep="first"): return self._n_largest_or_smallest(n=n, keep=keep, largest=True) def nsmallest(self, n=5, keep="first"): - """Returns a new Series of the *n* smallest element. + """ + Returns a new Series of the *n* smallest element. Parameters ---------- @@ -3328,10 +3332,11 @@ def nsmallest(self, n=5, keep="first"): keep : {'first', 'last'}, default 'first' When there are duplicate values that cannot all fit in a Series of `n` elements: + - ``first`` : return the first `n` occurrences in order - of appearance. + of appearance. - ``last`` : return the last `n` occurrences in reverse - order of appearance. + order of appearance. Returns -------