Skip to content

Commit

Permalink
Add Dask LabelEncoder to the documentation (#5023)
Browse files Browse the repository at this point in the history
This will close #4931

Authors:
  - Nick Becker (https://github.com/beckernick)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: #5023
  • Loading branch information
beckernick authored Dec 18, 2022
1 parent 90fd209 commit 28e9f30
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 75 deletions.
3 changes: 3 additions & 0 deletions docs/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,9 @@ Feature and Label Encoding (Dask-based Multi-GPU)
.. autoclass:: cuml.dask.preprocessing.LabelBinarizer
:members:

.. autoclass:: cuml.dask.preprocessing.LabelEncoder.LabelEncoder
:members:

.. autoclass:: cuml.dask.preprocessing.OneHotEncoder
:members:

Expand Down
152 changes: 77 additions & 75 deletions python/cuml/dask/preprocessing/LabelEncoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class LabelEncoder(BaseEstimator,
DelayedTransformMixin,
DelayedInverseTransformMixin):
"""
An nvcategory based implementation of ordinal label encoding
A cuDF-based implementation of ordinal label encoding
Parameters
----------
Expand All @@ -43,78 +43,80 @@ class LabelEncoder(BaseEstimator,
--------
Converting a categorical column to a numerical encoding
>>> from dask_cuda import LocalCUDACluster
>>> from dask.distributed import Client
>>> import cudf
>>> import dask_cudf
>>> from cuml.dask.preprocessing import LabelEncoder
>>> import pandas as pd
>>> pd.set_option('display.max_colwidth', 2000)
>>> cluster = LocalCUDACluster(threads_per_worker=1)
>>> client = Client(cluster)
>>> df = cudf.DataFrame({'num_col':[10, 20, 30, 30, 30],
... 'cat_col':['a','b','c','a','a']})
>>> ddf = dask_cudf.from_cudf(df, npartitions=2)
>>> # There are two functionally equivalent ways to do this
>>> le = LabelEncoder()
>>> le.fit(ddf.cat_col) # le = le.fit(data.category) also works
<cuml.dask.preprocessing.LabelEncoder.LabelEncoder object at 0x...>
>>> encoded = le.transform(ddf.cat_col)
>>> print(encoded.compute())
0 0
1 1
2 2
3 0
4 0
dtype: uint8
>>> # This method is preferred
>>> le = LabelEncoder()
>>> encoded = le.fit_transform(ddf.cat_col)
>>> print(encoded.compute())
0 0
1 1
2 2
3 0
4 0
dtype: uint8
>>> # We can assign this to a new column
>>> ddf = ddf.assign(encoded=encoded.values)
>>> print(ddf.compute())
num_col cat_col encoded
0 10 a 0
1 20 b 1
2 30 c 2
3 30 a 0
4 30 a 0
>>> # We can also encode more data
>>> test_data = cudf.Series(['c', 'a'])
>>> encoded = le.transform(dask_cudf.from_cudf(test_data,
... npartitions=2))
>>> print(encoded.compute())
0 2
1 0
dtype: uint8
>>> # After train, ordinal label can be inverse_transform() back to
>>> # string labels
>>> ord_label = cudf.Series([0, 0, 1, 2, 1])
>>> ord_label = le.inverse_transform(
... dask_cudf.from_cudf(ord_label,npartitions=2))
>>> print(ord_label.compute())
0 a
1 a
2 b
0 c
1 b
dtype: object
>>> client.close()
>>> cluster.close()
.. code-block:: python
>>> from dask_cuda import LocalCUDACluster
>>> from dask.distributed import Client
>>> import cudf
>>> import dask_cudf
>>> from cuml.dask.preprocessing import LabelEncoder
>>> import pandas as pd
>>> pd.set_option('display.max_colwidth', 2000)
>>> cluster = LocalCUDACluster(threads_per_worker=1)
>>> client = Client(cluster)
>>> df = cudf.DataFrame({'num_col':[10, 20, 30, 30, 30],
... 'cat_col':['a','b','c','a','a']})
>>> ddf = dask_cudf.from_cudf(df, npartitions=2)
>>> # There are two functionally equivalent ways to do this
>>> le = LabelEncoder()
>>> le.fit(ddf.cat_col) # le = le.fit(ddf.cat_col) also works
<cuml.dask.preprocessing.LabelEncoder.LabelEncoder object at 0x...>
>>> encoded = le.transform(ddf.cat_col)
>>> print(encoded.compute())
0 0
1 1
2 2
3 0
4 0
dtype: uint8
>>> # This method is preferred
>>> le = LabelEncoder()
>>> encoded = le.fit_transform(ddf.cat_col)
>>> print(encoded.compute())
0 0
1 1
2 2
3 0
4 0
dtype: uint8
>>> # We can assign this to a new column
>>> ddf = ddf.assign(encoded=encoded.values)
>>> print(ddf.compute())
num_col cat_col encoded
0 10 a 0
1 20 b 1
2 30 c 2
3 30 a 0
4 30 a 0
>>> # We can also encode more data
>>> test_data = cudf.Series(['c', 'a'])
>>> encoded = le.transform(dask_cudf.from_cudf(test_data,
... npartitions=2))
>>> print(encoded.compute())
0 2
1 0
dtype: uint8
>>> # After fitting, ordinal labels can be converted back to
>>> # string labels with inverse_transform()
>>> ord_label = cudf.Series([0, 0, 1, 2, 1])
>>> ord_label = le.inverse_transform(
... dask_cudf.from_cudf(ord_label,npartitions=2))
>>> print(ord_label.compute())
0 a
1 a
2 b
0 c
1 b
dtype: object
>>> client.close()
>>> cluster.close()
"""
def __init__(self, *, client=None, verbose=False, **kwargs):
Expand All @@ -124,7 +126,7 @@ def __init__(self, *, client=None, verbose=False, **kwargs):

def fit(self, y):
"""
Fit a LabelEncoder (nvcategory) instance to a set of categories
Fit a LabelEncoder instance to a set of categories
Parameters
----------
Expand All @@ -138,7 +140,7 @@ def fit(self, y):
A fitted instance of itself to allow method chaining
Notes
--------
-----
The unique classes will be collected at the client, which will
consume memory proportional to the number of unique classes.
"""
Expand Down

0 comments on commit 28e9f30

Please sign in to comment.