microsoft · StrikerRUS · Jan 22, 2022 · Dec 18, 2021 · Dec 18, 2021 · Dec 19, 2021
@@ -3503,7 +3503,20 @@ def predict(self, data, start_iteration=0, num_iteration=None,
                                  raw_score, pred_leaf, pred_contrib,
                                  data_has_header, is_reshape)
 
-    def refit(self, data, label, decay_rate=0.9, **kwargs):
+    def refit(
+        self,
+        data,
+        label,
+        decay_rate=0.9,
+        reference=None,
+        weight=None,
+        group=None,
+        init_score=None,
+        feature_name='auto',
+        categorical_feature='auto',
+        free_raw_data=True,
+        **kwargs
+    ):
         """Refit the existing Booster by new data.
 
         Parameters
@@ -3516,6 +3529,29 @@ def refit(self, data, label, decay_rate=0.9, **kwargs):
         decay_rate : float, optional (default=0.9)
             Decay rate of refit,
             will use ``leaf_output = decay_rate * old_leaf_output + (1.0 - decay_rate) * new_leaf_output`` to refit trees.
+        reference : Dataset or None, optional (default=None)
+            Reference for ``data``.
+            If this is Dataset for validation, training data should be used as reference.
+        weight : list, numpy 1-D array, pandas Series or None, optional (default=None)
+            Weight for each ``data`` instance.
+        group : list, numpy 1-D array, pandas Series or None, optional (default=None)
+            Group/query size for ``data``.
+        init_score : list, numpy 1-D array, pandas Series or None, optional (default=None)
+            Init score for ``data``.
+        feature_name : list of strings or 'auto', optional (default="auto")
+            Feature names for ``data``.
+            If 'auto' and data is pandas DataFrame, data columns names are used.
+        categorical_feature : list of strings or int, or 'auto', optional (default="auto")
+            Categorical features for ``data``.
+            If list of int, interpreted as indices.
+            If list of strings, interpreted as feature names (need to specify ``feature_name`` as well).
+            If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
+            All values in categorical features should be less than int32 max value (2147483647).
+            Large values could be memory consuming. Consider using consecutive integers starting from zero.
+            All negative values in categorical features will be treated as missing values.
+            The output cannot be monotonically constrained with respect to a categorical feature.
+        free_raw_data : bool, optional (default=True)
+            If True, raw data is freed after constructing inner Dataset for ``data``.
         **kwargs
             Other parameters for refit.
             These parameters will be passed to ``predict`` method.
@@ -3540,7 +3576,18 @@ def refit(self, data, label, decay_rate=0.9, **kwargs):
             default_value=None
         )
         new_params["linear_tree"] = bool(out_is_linear.value)
-        train_set = Dataset(data, label, params=new_params)
+        train_set = Dataset(
+            data=data,
+            label=label,
+            reference=reference,
+            weight=weight,
+            group=group,
+            init_score=init_score,
+            feature_name=feature_name,
+            categorical_feature=categorical_feature,
+            params=new_params,
+            free_raw_data=free_raw_data,
+        )
         new_params['refit_decay_rate'] = decay_rate
         new_booster = Booster(new_params, train_set)
         # Copy models

@@ -1545,6 +1545,34 @@ def test_refit():
     assert err_pred > new_err_pred
 
 
+def test_refit_dataset_params():
+    # check refit accepts dataset_params
+    X, y = load_breast_cancer(return_X_y=True)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
+    lgb_train = lgb.Dataset(X_train, y_train)
+    params = {
+        'objective': 'binary',
+        'metric': 'binary_logloss',
+        'verbose': -1,
+        'min_data': 10
+    }
+    gbm = lgb.train(params, lgb_train, num_boost_round=20)
+    # baseline: error of the model trained without sample weights
+    non_weight_err_pred = log_loss(y_test, gbm.predict(X_test))
+    # draw the sample weights from a seeded generator so that the test is
+    # reproducible and does not depend on (or disturb) the global NumPy RNG state;
+    # the absolute value keeps every weight non-negative
+    refit_weight = np.abs(np.random.RandomState(42).normal(size=y_train.shape))
+    # every Dataset-related argument is passed explicitly to make sure refit accepts it
+    new_gbm = gbm.refit(
+        data=X_train,
+        label=y_train,
+        weight=refit_weight,
+        reference=None,
+        group=None,
+        init_score=None,
+        feature_name="auto",
+        categorical_feature="auto",
+        free_raw_data=True
+    )
+    weight_err_pred = log_loss(y_test, new_gbm.predict(X_test))
+    # refitting with non-uniform weights is expected to change the predictions
+    assert weight_err_pred != non_weight_err_pred
+
+
 def test_mape_rf():
     X, y = load_boston(return_X_y=True)
     params = {