Skip to content

Commit

Permalink
Accept fold index for TargetEncoder (rapidsai#4453)
Browse files Browse the repository at this point in the history
As requested in issue rapidsai#4441, this PR lets TargetEncoder accept a customized fold index array in `fit()` and `fit_transform()`.
For example, in the following code:
```
X = [1, 2, 3, 1, 2]
y = [1, 0, 0, 0, 1]
fold_ids = [0, 1, 0, 0, 1]
encoder = TargetEncoder(split_method='customize')
encoder.fit(X, y, fold_ids=fold_ids)
```
the target encoder fits on the subarray of `X` and `y` where `fold_ids == 0` to encode the subarray of `X` where `fold_ids == 1`, and vice versa.

Authors:
  - Jiwei Liu (https://github.com/daxiongshu)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: rapidsai#4453
  • Loading branch information
daxiongshu authored Jan 20, 2022
1 parent 88d0e42 commit ac4db43
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 12 deletions.
69 changes: 58 additions & 11 deletions python/cuml/preprocessing/TargetEncoder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -19,6 +19,7 @@
import cupy as cp
import numpy as np
from cuml.common.exceptions import NotFittedError
import warnings


class TargetEncoder:
Expand Down Expand Up @@ -47,6 +48,8 @@ class TargetEncoder:
'random': random split.
'continuous': consecutive samples are grouped into one fold.
'interleaved': samples are assigned to each fold in a round-robin way.
'customize': customize splitting by providing a `fold_ids` array
in `fit()` or `fit_transform()` functions.
output_type: {'cupy', 'numpy', 'auto'}, default = 'auto'
The data type of output. If 'auto', it matches input data.
Expand Down Expand Up @@ -96,9 +99,10 @@ def __init__(self, n_folds=4, smooth=0, seed=42,
if not isinstance(seed, int):
raise ValueError('seed {} is not an integer'.format(seed))

if split_method not in {'random', 'continuous', 'interleaved'}:
if split_method not in {'random', 'continuous', 'interleaved',
'customize'}:
msg = ("split_method should be either 'random'"
" or 'continuous' or 'interleaved', "
" or 'continuous' or 'interleaved', or 'customize'"
"got {0}.".format(self.split))
raise ValueError(msg)

Expand All @@ -114,7 +118,7 @@ def __init__(self, n_folds=4, smooth=0, seed=42,
self.train = None
self.output_type = output_type

def fit(self, x, y, fold_ids=None):
    """
    Fit a TargetEncoder instance to a set of categories

    Parameters
    ----------
    x: cudf.Series or cudf.DataFrame or cupy.ndarray
       categories to be encoded. Its elements may or may
       not be unique
    y : cudf.Series or cupy.ndarray
        Series containing the target variable.
    fold_ids: cudf.Series or cupy.ndarray
        Series containing the indices of the customized
        folds. Its values should be integers in range
        `[0, N-1]` to split data into `N` folds. If None,
        fold_ids is generated based on `split_method`.

    Returns
    -------
    self : TargetEncoder
        A fitted instance of itself to allow method chaining

    Raises
    ------
    ValueError
        If `split_method` is 'customize' but `fold_ids` is None, or
        if `fold_ids` does not have the same length as `x`.
    """
    if fold_ids is None:
        if self.split == 'customize':
            raise ValueError("`fold_ids` is required "
                             "since split_method is set to "
                             "'customize'.")
    else:
        # Validate before touching any state so a bad call leaves the
        # encoder's configuration untouched.
        if len(fold_ids) != len(x):
            raise ValueError(f"`fold_ids` length {len(fold_ids)} "
                             "is different from input data length "
                             f"{len(x)}")
        if self.split != 'customize':
            # Explicit fold ids take precedence over the configured
            # split method. This must be an assignment: the fold column
            # is chosen from `self.split`, so leaving it unchanged would
            # silently ignore the provided `fold_ids`.
            self.split = 'customize'
            warnings.warn("split_method is set to 'customize' "
                          "since `fold_ids` are provided.")

    res, train = self._fit_transform(x, y, fold_ids=fold_ids)
    self.train_encode = res
    self.train = train
    self._fitted = True
    return self

def fit_transform(self, x, y, fold_ids=None):
    """
    Fit the encoder on `x` and `y`, then return the encoding of `x`

    This delegates to `fit()` with the same arguments and hands back
    the `train_encode` attribute that `fit()` stores on the instance.

    Parameters
    ----------
    x: cudf.Series or cudf.DataFrame or cupy.ndarray
       categories to be encoded. Its elements may or may
       not be unique
    y : cudf.Series or cupy.ndarray
        Series containing the target variable.
    fold_ids: cudf.Series or cupy.ndarray
        Series containing the indices of the customized
        folds. Its values should be integers in range
        `[0, N-1]` to split data into `N` folds. If None,
        fold_ids is generated based on `split_method`.

    Returns
    -------
    encoded : cupy.ndarray
        The encoded training input, i.e. `self.train_encode`
        as populated by `fit()`.
    """
    # Fitting populates `self.train_encode` as a side effect.
    self.fit(x, y, fold_ids=fold_ids)
    encoded = self.train_encode
    return encoded

def transform(self, x):
Expand Down Expand Up @@ -174,7 +214,7 @@ def transform(self, x):
test = test.merge(self.encode_all, on=x_cols, how='left')
return self._impute_and_sort(test)

def _fit_transform(self, x, y):
def _fit_transform(self, x, y, fold_ids):
"""
Core function of target encoding
"""
Expand All @@ -185,7 +225,7 @@ def _fit_transform(self, x, y):
train[self.y_col] = self._make_y_column(y)

self.n_folds = min(self.n_folds, len(train))
train[self.fold_col] = self._make_fold_column(len(train))
train[self.fold_col] = self._make_fold_column(len(train), fold_ids)

self.mean = train[self.y_col].mean()

Expand Down Expand Up @@ -237,17 +277,24 @@ def _make_y_column(self, y):
"or numpy.ndarray"
"or cupy.ndarray")

def _make_fold_column(self, len_train):
def _make_fold_column(self, len_train, fold_ids):
"""
Create a fold id column for each split_method
"""

if self.split == 'random':
return cp.random.randint(0, self.n_folds, len_train)
elif self.split == 'continuous':
return (cp.arange(len_train) /
(len_train/self.n_folds)) % self.n_folds
elif self.split == 'interleaved':
return cp.arange(len_train) % self.n_folds
elif self.split == 'customize':
if fold_ids is None:
raise ValueError("fold_ids can't be None"
"since split_method is set to"
"'customize'.")
return fold_ids
else:
msg = ("split should be either 'random'"
" or 'continuous' or 'interleaved', "
Expand Down
26 changes: 25 additions & 1 deletion python/cuml/test/test_target_encoder.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -204,3 +204,27 @@ def test_targetencoder_smooth():
train_encoded = encoder.transform(train.category)

assert array_equal(train_encoded, answer)


def test_targetencoder_customized_fold_id():
    """
    Split the data with a user-supplied `fold_ids` array.

    Here the 1st sample is placed in fold 0, the 2nd and 3rd
    samples in fold 1, and the 4th sample in fold 2.
    """
    custom_folds = [0, 1, 1, 2]
    expected = np.array([1., 0.75, 0.75, 1.])
    frame = cudf.DataFrame({'category': ['a', 'b', 'b', 'a'],
                            'label': [1, 0, 1, 1]})

    # Path 1: fit and encode in a single call.
    one_step = TargetEncoder(split_method='customize')
    encoded = one_step.fit_transform(frame.category, frame.label,
                                     fold_ids=custom_folds)
    assert array_equal(encoded, expected)

    # Path 2: fit first, then encode the same column with transform().
    two_step = TargetEncoder(split_method='customize')
    two_step.fit(frame.category, frame.label, fold_ids=custom_folds)
    encoded = two_step.transform(frame.category)
    assert array_equal(encoded, expected)

0 comments on commit ac4db43

Please sign in to comment.