Accept fold index for TargetEncoder (rapidsai#4453)

As requested in issue rapidsai#4441, this PR lets `TargetEncoder` accept a customized fold index array in `fit()`. For example, in the following code:

```
X = [1, 2, 3, 1, 2]
y = [1, 0, 0, 0, 1]
fold_ids = [0, 1, 0, 0, 1]
encoder = TargetEncoder(split_method='customize')
encoder.fit(X, y, fold_ids=fold_ids)
```

the target encoder fits on the subarrays of `X` and `y` where `fold_ids == 0` to encode the subarray of `X` where `fold_ids == 1`, and vice versa.

Authors:
- Jiwei Liu (https://github.com/daxiongshu)

Approvers:
- Dante Gama Dessavre (https://github.com/dantegd)

URL: rapidsai#4453
vimarsh6739 · Jan 20, 2022 · ac4db43 · ac4db43
1 parent 88d0e42
commit ac4db43
Show file tree

Hide file tree

Showing 2 changed files with 83 additions and 12 deletions.
diff --git a/python/cuml/preprocessing/TargetEncoder.py b/python/cuml/preprocessing/TargetEncoder.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 import cupy as cp
 import numpy as np
 from cuml.common.exceptions import NotFittedError
+import warnings
 
 
 class TargetEncoder:
@@ -47,6 +48,8 @@ class TargetEncoder:
         'random': random split.
         'continuous': consecutive samples are grouped into one folds.
         'interleaved': samples are assign to each fold in a round robin way.
+        'customize': customize splitting by providing a `fold_ids` array
+                     in `fit()` or `fit_transform()` functions.
     output_type: {'cupy', 'numpy', 'auto'}, default = 'auto'
         The data type of output. If 'auto', it matches input data.
 
@@ -96,9 +99,10 @@ def __init__(self, n_folds=4, smooth=0, seed=42,
         if not isinstance(seed, int):
             raise ValueError('seed {} is not an integer'.format(seed))
 
-        if split_method not in {'random', 'continuous', 'interleaved'}:
+        if split_method not in {'random', 'continuous', 'interleaved',
+                                'customize'}:
             msg = ("split_method should be either 'random'"
-                   " or 'continuous' or 'interleaved', "
+                   " or 'continuous' or 'interleaved', or 'customize'"
                    "got {0}.".format(self.split))
             raise ValueError(msg)
 
@@ -114,7 +118,7 @@ def __init__(self, n_folds=4, smooth=0, seed=42,
         self.train = None
         self.output_type = output_type
 
-    def fit(self, x, y):
+    def fit(self, x, y, fold_ids=None):
         """
         Fit a TargetEncoder instance to a set of categories
 
@@ -125,26 +129,62 @@ def fit(self, x, y):
            not be unique
         y : cudf.Series or cupy.ndarray
             Series containing the target variable.
-
+        fold_ids: cudf.Series or cupy.ndarray
+            Series containing the indices of the customized
+            folds. Its values should be integers in range
+            `[0, N-1]` to split data into `N` folds. If None,
+            fold_ids is generated based on `split_method`.
         Returns
         -------
         self : TargetEncoder
             A fitted instance of itself to allow method chaining
         """
-        res, train = self._fit_transform(x, y)
+        if self.split == 'customize' and fold_ids is None:
+            raise ValueError("`fold_ids` is required "
+                             "since split_method is set to"
+                             "'customize'.")
+        if fold_ids is not None and self.split != 'customize':
+            self.split == 'customize'
+            warnings.warn("split_method is set to 'customize'"
+                          "since `fold_ids` are provided.")
+        if fold_ids is not None and len(fold_ids) != len(x):
+            raise ValueError(f"`fold_ids` length {len(fold_ids)}"
+                             "is different from input data length"
+                             f"{len(x)}")
+
+        res, train = self._fit_transform(x, y, fold_ids=fold_ids)
         self.train_encode = res
         self.train = train
         self._fitted = True
         return self
 
-    def fit_transform(self, x, y):
+    def fit_transform(self, x, y, fold_ids=None):
         """
         Simultaneously fit and transform an input
 
         This is functionally equivalent to (but faster than)
         `TargetEncoder().fit(y).transform(y)`
+
+        Parameters
+        ----------
+        x: cudf.Series or cudf.DataFrame or cupy.ndarray
+           categories to be encoded. It's elements may or may
+           not be unique
+        y : cudf.Series or cupy.ndarray
+            Series containing the target variable.
+        fold_ids: cudf.Series or cupy.ndarray
+            Series containing the indices of the customized
+            folds. Its values should be integers in range
+            `[0, N-1]` to split data into `N` folds. If None,
+            fold_ids is generated based on `split_method`.
+
+        Returns
+        -------
+        encoded : cupy.ndarray
+            The ordinally encoded input series
+
         """
-        self.fit(x, y)
+        self.fit(x, y, fold_ids=fold_ids)
         return self.train_encode
 
     def transform(self, x):
@@ -174,7 +214,7 @@ def transform(self, x):
         test = test.merge(self.encode_all, on=x_cols, how='left')
         return self._impute_and_sort(test)
 
-    def _fit_transform(self, x, y):
+    def _fit_transform(self, x, y, fold_ids):
         """
         Core function of target encoding
         """
@@ -185,7 +225,7 @@ def _fit_transform(self, x, y):
         train[self.y_col] = self._make_y_column(y)
 
         self.n_folds = min(self.n_folds, len(train))
-        train[self.fold_col] = self._make_fold_column(len(train))
+        train[self.fold_col] = self._make_fold_column(len(train), fold_ids)
 
         self.mean = train[self.y_col].mean()
 
@@ -237,17 +277,24 @@ def _make_y_column(self, y):
                 "or numpy.ndarray"
                 "or cupy.ndarray")
 
-    def _make_fold_column(self, len_train):
+    def _make_fold_column(self, len_train, fold_ids):
         """
         Create a fold id column for each split_method
         """
+
         if self.split == 'random':
             return cp.random.randint(0, self.n_folds, len_train)
         elif self.split == 'continuous':
             return (cp.arange(len_train) /
                     (len_train/self.n_folds)) % self.n_folds
         elif self.split == 'interleaved':
             return cp.arange(len_train) % self.n_folds
+        elif self.split == 'customize':
+            if fold_ids is None:
+                raise ValueError("fold_ids can't be None"
+                                 "since split_method is set to"
+                                 "'customize'.")
+            return fold_ids
         else:
             msg = ("split should be either 'random'"
                    " or 'continuous' or 'interleaved', "

diff --git a/python/cuml/test/test_target_encoder.py b/python/cuml/test/test_target_encoder.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2021, NVIDIA CORPORATION.
+# Copyright (c) 2019-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -204,3 +204,27 @@ def test_targetencoder_smooth():
         train_encoded = encoder.transform(train.category)
 
         assert array_equal(train_encoded, answer)
+
+
+def test_targetencoder_customized_fold_id():
+    """
+    use customized `fold_ids` array to split data.
+    in this example, the 1st sample belongs to `fold 0`
+    the 2nd and 3rd sample belongs to `fold 1`
+    and the 4th sample belongs to `fold 2`
+    """
+    train = cudf.DataFrame({'category': ['a', 'b', 'b', 'a'],
+                            'label': [1, 0, 1, 1]})
+    fold_ids = [0, 1, 1, 2]
+    encoder = TargetEncoder(split_method='customize')
+    train_encoded = encoder.fit_transform(train.category, train.label,
+                                          fold_ids=fold_ids)
+    answer = np.array([1., 0.75, 0.75, 1.])
+    assert array_equal(train_encoded, answer)
+
+    encoder = TargetEncoder(split_method='customize')
+    encoder.fit(train.category, train.label,
+                fold_ids=fold_ids)
+    train_encoded = encoder.transform(train.category)
+
+    assert array_equal(train_encoded, answer)