From ac4db43e851d09fe9f0a596c5c8de8621616b40b Mon Sep 17 00:00:00 2001 From: Jiwei Liu Date: Thu, 20 Jan 2022 11:47:22 -0500 Subject: [PATCH] Accept fold index for TargetEncoder (#4453) As requested in issue #4441, in this PR we let TargetEncoder accept a customized fold index array in `fit()`. For example, in the following code: ``` X = [1, 2, 3, 1, 2] y = [1, 0, 0, 0, 1] fold_ids = [0,1,0,0,1] encoder = TargetEncoder(split_method='customize') encoder.fit(X,y,fold_ids=fold_ids) ``` The target encoder will fit the subarray of `X` and `y` where `fold_ids==0` to encode the subarray of `X` where `fold_ids==1`, and vice versa. Authors: - Jiwei Liu (https://github.com/daxiongshu) Approvers: - Dante Gama Dessavre (https://github.com/dantegd) URL: https://github.com/rapidsai/cuml/pull/4453 --- python/cuml/preprocessing/TargetEncoder.py | 69 ++++++++++++++++++---- python/cuml/test/test_target_encoder.py | 26 +++++++- 2 files changed, 83 insertions(+), 12 deletions(-) diff --git a/python/cuml/preprocessing/TargetEncoder.py b/python/cuml/preprocessing/TargetEncoder.py index 8f885cce9b..07e63b75ff 100644 --- a/python/cuml/preprocessing/TargetEncoder.py +++ b/python/cuml/preprocessing/TargetEncoder.py @@ -1,5 +1,5 @@ # -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ import cupy as cp import numpy as np from cuml.common.exceptions import NotFittedError +import warnings class TargetEncoder: @@ -47,6 +48,8 @@ class TargetEncoder: 'random': random split. 'continuous': consecutive samples are grouped into one folds. 'interleaved': samples are assign to each fold in a round robin way. + 'customize': customize splitting by providing a `fold_ids` array + in `fit()` or `fit_transform()` functions. output_type: {'cupy', 'numpy', 'auto'}, default = 'auto' The data type of output. 
If 'auto', it matches input data. @@ -96,9 +99,10 @@ def __init__(self, n_folds=4, smooth=0, seed=42, if not isinstance(seed, int): raise ValueError('seed {} is not an integer'.format(seed)) - if split_method not in {'random', 'continuous', 'interleaved'}: + if split_method not in {'random', 'continuous', 'interleaved', + 'customize'}: msg = ("split_method should be either 'random'" - " or 'continuous' or 'interleaved', " + " or 'continuous' or 'interleaved', or 'customize'" "got {0}.".format(self.split)) raise ValueError(msg) @@ -114,7 +118,7 @@ def __init__(self, n_folds=4, smooth=0, seed=42, self.train = None self.output_type = output_type - def fit(self, x, y): + def fit(self, x, y, fold_ids=None): """ Fit a TargetEncoder instance to a set of categories @@ -125,26 +129,62 @@ def fit(self, x, y): not be unique y : cudf.Series or cupy.ndarray Series containing the target variable. - + fold_ids: cudf.Series or cupy.ndarray + Series containing the indices of the customized + folds. Its values should be integers in range + `[0, N-1]` to split data into `N` folds. If None, + fold_ids is generated based on `split_method`. 
Returns ------- self : TargetEncoder A fitted instance of itself to allow method chaining """ - res, train = self._fit_transform(x, y) + if self.split == 'customize' and fold_ids is None: + raise ValueError("`fold_ids` is required " + "since split_method is set to" + "'customize'.") + if fold_ids is not None and self.split != 'customize': + self.split == 'customize' + warnings.warn("split_method is set to 'customize'" + "since `fold_ids` are provided.") + if fold_ids is not None and len(fold_ids) != len(x): + raise ValueError(f"`fold_ids` length {len(fold_ids)}" + "is different from input data length" + f"{len(x)}") + + res, train = self._fit_transform(x, y, fold_ids=fold_ids) self.train_encode = res self.train = train self._fitted = True return self - def fit_transform(self, x, y): + def fit_transform(self, x, y, fold_ids=None): """ Simultaneously fit and transform an input This is functionally equivalent to (but faster than) `TargetEncoder().fit(y).transform(y)` + + Parameters + ---------- + x: cudf.Series or cudf.DataFrame or cupy.ndarray + categories to be encoded. It's elements may or may + not be unique + y : cudf.Series or cupy.ndarray + Series containing the target variable. + fold_ids: cudf.Series or cupy.ndarray + Series containing the indices of the customized + folds. Its values should be integers in range + `[0, N-1]` to split data into `N` folds. If None, + fold_ids is generated based on `split_method`. 
+ + Returns + ------- + encoded : cupy.ndarray + The ordinally encoded input series + """ - self.fit(x, y) + self.fit(x, y, fold_ids=fold_ids) return self.train_encode def transform(self, x): @@ -174,7 +214,7 @@ def transform(self, x): test = test.merge(self.encode_all, on=x_cols, how='left') return self._impute_and_sort(test) - def _fit_transform(self, x, y): + def _fit_transform(self, x, y, fold_ids): """ Core function of target encoding """ @@ -185,7 +225,7 @@ def _fit_transform(self, x, y): train[self.y_col] = self._make_y_column(y) self.n_folds = min(self.n_folds, len(train)) - train[self.fold_col] = self._make_fold_column(len(train)) + train[self.fold_col] = self._make_fold_column(len(train), fold_ids) self.mean = train[self.y_col].mean() @@ -237,10 +277,11 @@ def _make_y_column(self, y): "or numpy.ndarray" "or cupy.ndarray") - def _make_fold_column(self, len_train): + def _make_fold_column(self, len_train, fold_ids): """ Create a fold id column for each split_method """ + if self.split == 'random': return cp.random.randint(0, self.n_folds, len_train) elif self.split == 'continuous': @@ -248,6 +289,12 @@ def _make_fold_column(self, len_train): (len_train/self.n_folds)) % self.n_folds elif self.split == 'interleaved': return cp.arange(len_train) % self.n_folds + elif self.split == 'customize': + if fold_ids is None: + raise ValueError("fold_ids can't be None" + "since split_method is set to" + "'customize'.") + return fold_ids else: msg = ("split should be either 'random'" " or 'continuous' or 'interleaved', " diff --git a/python/cuml/test/test_target_encoder.py b/python/cuml/test/test_target_encoder.py index f522cfcd5a..8b389f8dc6 100644 --- a/python/cuml/test/test_target_encoder.py +++ b/python/cuml/test/test_target_encoder.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2021, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -204,3 +204,27 @@ def test_targetencoder_smooth(): train_encoded = encoder.transform(train.category) assert array_equal(train_encoded, answer) + + +def test_targetencoder_customized_fold_id(): + """ + use customized `fold_ids` array to split data. + in this example, the 1st sample belongs to `fold 0` + the 2nd and 3rd sample belongs to `fold 1` + and the 4th sample belongs to `fold 2` + """ + train = cudf.DataFrame({'category': ['a', 'b', 'b', 'a'], + 'label': [1, 0, 1, 1]}) + fold_ids = [0, 1, 1, 2] + encoder = TargetEncoder(split_method='customize') + train_encoded = encoder.fit_transform(train.category, train.label, + fold_ids=fold_ids) + answer = np.array([1., 0.75, 0.75, 1.]) + assert array_equal(train_encoded, answer) + + encoder = TargetEncoder(split_method='customize') + encoder.fit(train.category, train.label, + fold_ids=fold_ids) + train_encoded = encoder.transform(train.category) + + assert array_equal(train_encoded, answer)