From 1cf2b5d38f685a70f9881c2e20ddc07ee1ed826f Mon Sep 17 00:00:00 2001 From: flust <254170321@qq.com> Date: Sun, 17 Jan 2021 07:00:34 +0000 Subject: [PATCH] FEA: Add basic framework of 'lightgbm' and merge xgboost and lightgbm into DecisionTree model(DecisionTreeDataset, DecisionTreeDataLoader) --- recbole/config/configurator.py | 2 +- recbole/data/dataloader/__init__.py | 2 +- ...taloader.py => decisiontree_dataloader.py} | 14 +- recbole/data/dataset/__init__.py | 2 +- ...ost_dataset.py => decisiontree_dataset.py} | 6 +- recbole/data/utils.py | 14 +- recbole/model/exlib_recommender/lightgbm.py | 26 +++ recbole/model/exlib_recommender/xgboost.py | 2 +- recbole/properties/model/lightgbm.yaml | 23 ++ recbole/properties/model/xgboost.yaml | 11 - recbole/trainer/trainer.py | 205 ++++++++++++------ recbole/utils/enum_type.py | 2 +- 12 files changed, 212 insertions(+), 97 deletions(-) rename recbole/data/dataloader/{xgboost_dataloader.py => decisiontree_dataloader.py} (66%) rename recbole/data/dataset/{xgboost_dataset.py => decisiontree_dataset.py} (94%) create mode 100644 recbole/model/exlib_recommender/lightgbm.py create mode 100644 recbole/properties/model/lightgbm.yaml diff --git a/recbole/config/configurator.py b/recbole/config/configurator.py index 04e51adfa..39cd428bc 100644 --- a/recbole/config/configurator.py +++ b/recbole/config/configurator.py @@ -229,7 +229,7 @@ def _load_internal_config_dict(self, model, model_class, dataset): self.internal_config_dict['MODEL_TYPE'] = model_class.type if self.internal_config_dict['MODEL_TYPE'] == ModelType.GENERAL: pass - elif self.internal_config_dict['MODEL_TYPE'] in {ModelType.CONTEXT, ModelType.XGBOOST}: + elif self.internal_config_dict['MODEL_TYPE'] in {ModelType.CONTEXT, ModelType.DECISIONTREE}: self._update_internal_config_dict(context_aware_init) if dataset == 'ml-100k': self._update_internal_config_dict(context_aware_on_ml_100k_init) diff --git a/recbole/data/dataloader/__init__.py 
b/recbole/data/dataloader/__init__.py index 90ffa311f..b8b2e911f 100644 --- a/recbole/data/dataloader/__init__.py +++ b/recbole/data/dataloader/__init__.py @@ -4,5 +4,5 @@ from recbole.data.dataloader.context_dataloader import * from recbole.data.dataloader.sequential_dataloader import * from recbole.data.dataloader.knowledge_dataloader import * -from recbole.data.dataloader.xgboost_dataloader import * +from recbole.data.dataloader.decisiontree_dataloader import * from recbole.data.dataloader.user_dataloader import * diff --git a/recbole/data/dataloader/xgboost_dataloader.py b/recbole/data/dataloader/decisiontree_dataloader.py similarity index 66% rename from recbole/data/dataloader/xgboost_dataloader.py rename to recbole/data/dataloader/decisiontree_dataloader.py index 17c9ab2b0..996b720a8 100644 --- a/recbole/data/dataloader/xgboost_dataloader.py +++ b/recbole/data/dataloader/decisiontree_dataloader.py @@ -8,7 +8,7 @@ # @Email : 254170321@qq.com """ -recbole.data.dataloader.xgboost_dataloader +recbole.data.dataloader.decisiontree_dataloader ################################################ """ @@ -16,24 +16,24 @@ GeneralFullDataLoader -class XgboostDataLoader(GeneralDataLoader): - """:class:`XgboostDataLoader` is inherit from +class DecisionTreeDataLoader(GeneralDataLoader): + """:class:`DecisionTreeDataLoader` inherits from :class:`~recbole.data.dataloader.general_dataloader.GeneralDataLoader`, and didn't add/change anything at all. """ pass -class XgboostNegSampleDataLoader(GeneralNegSampleDataLoader): - """:class:`XgboostNegSampleDataLoader` is inherit from +class DecisionTreeNegSampleDataLoader(GeneralNegSampleDataLoader): + """:class:`DecisionTreeNegSampleDataLoader` inherits from :class:`~recbole.data.dataloader.general_dataloader.GeneralNegSampleDataLoader`, and didn't add/change anything at all. 
""" pass -class XgboostFullDataLoader(GeneralFullDataLoader): - """:class:`XgboostFullDataLoader` is inherit from +class DecisionTreeFullDataLoader(GeneralFullDataLoader): + """:class:`DecisionTreeFullDataLoader` inherits from :class:`~recbole.data.dataloader.general_dataloader.GeneralFullDataLoader`, and didn't add/change anything at all. """ diff --git a/recbole/data/dataset/__init__.py b/recbole/data/dataset/__init__.py index 58026c332..739e86e46 100644 --- a/recbole/data/dataset/__init__.py +++ b/recbole/data/dataset/__init__.py @@ -3,5 +3,5 @@ from recbole.data.dataset.kg_dataset import KnowledgeBasedDataset from recbole.data.dataset.social_dataset import SocialDataset from recbole.data.dataset.kg_seq_dataset import Kg_Seq_Dataset -from recbole.data.dataset.xgboost_dataset import XgboostDataset +from recbole.data.dataset.decisiontree_dataset import DecisionTreeDataset from recbole.data.dataset.customized_dataset import * diff --git a/recbole/data/dataset/xgboost_dataset.py b/recbole/data/dataset/decisiontree_dataset.py similarity index 94% rename from recbole/data/dataset/xgboost_dataset.py rename to recbole/data/dataset/decisiontree_dataset.py index eb044b8e4..88d827695 100644 --- a/recbole/data/dataset/xgboost_dataset.py +++ b/recbole/data/dataset/decisiontree_dataset.py @@ -3,7 +3,7 @@ # @Email : 254170321@qq.com """ -recbole.data.xgboost_dataset +recbole.data.decisiontree_dataset ########################## """ @@ -11,8 +11,8 @@ from recbole.utils import FeatureType -class XgboostDataset(Dataset): - """:class:`XgboostDataset` is based on :class:`~recbole.data.dataset.dataset.Dataset`, +class DecisionTreeDataset(Dataset): + """:class:`DecisionTreeDataset` is based on :class:`~recbole.data.dataset.dataset.Dataset`, and Attributes: diff --git a/recbole/data/utils.py b/recbole/data/utils.py index fa8c4abf3..9f0f127c0 100644 --- a/recbole/data/utils.py +++ b/recbole/data/utils.py @@ -45,9 +45,9 @@ def create_dataset(config): elif model_type == 
ModelType.SOCIAL: from .dataset import SocialDataset return SocialDataset(config) - elif model_type == ModelType.XGBOOST: - from .dataset import XgboostDataset - return XgboostDataset(config) + elif model_type == ModelType.DECISIONTREE: + from .dataset import DecisionTreeDataset + return DecisionTreeDataset(config) else: from .dataset import Dataset return Dataset(config) @@ -283,13 +283,13 @@ def get_data_loader(name, config, eval_setting): return SequentialNegSampleDataLoader elif neg_sample_strategy == 'full': return SequentialFullDataLoader - elif model_type == ModelType.XGBOOST: + elif model_type == ModelType.DECISIONTREE: if neg_sample_strategy == 'none': - return XgboostDataLoader + return DecisionTreeDataLoader elif neg_sample_strategy == 'by': - return XgboostNegSampleDataLoader + return DecisionTreeNegSampleDataLoader elif neg_sample_strategy == 'full': - return XgboostFullDataLoader + return DecisionTreeFullDataLoader elif model_type == ModelType.KNOWLEDGE: if neg_sample_strategy == 'by': if name == 'train': diff --git a/recbole/model/exlib_recommender/lightgbm.py b/recbole/model/exlib_recommender/lightgbm.py new file mode 100644 index 000000000..28ef5d9a2 --- /dev/null +++ b/recbole/model/exlib_recommender/lightgbm.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +# @Time : 2020/1/17 +# @Author : Chen Yang +# @Email : 254170321@qq.com + +r""" +recbole.model.exlib_recommender.lightgbm +############################# +""" + +import lightgbm as lgb +from recbole.utils import ModelType, InputType + + +class lightgbm(lgb.Booster): + r"""lightgbm is inherited from lgb.Booster + + """ + type = ModelType.DECISIONTREE + input_type = InputType.POINTWISE + + def __init__(self, config, dataset): + super(lgb.Booster, self).__init__() + + def to(self, device): + return self diff --git a/recbole/model/exlib_recommender/xgboost.py b/recbole/model/exlib_recommender/xgboost.py index a09da2cdb..46f6c696a 100644 --- a/recbole/model/exlib_recommender/xgboost.py +++ 
b/recbole/model/exlib_recommender/xgboost.py @@ -16,7 +16,7 @@ class xgboost(xgb.Booster): r"""xgboost is inherited from xgb.Booster """ - type = ModelType.XGBOOST + type = ModelType.DECISIONTREE input_type = InputType.POINTWISE def __init__(self, config, dataset): diff --git a/recbole/properties/model/lightgbm.yaml b/recbole/properties/model/lightgbm.yaml new file mode 100644 index 000000000..d5cb8ca03 --- /dev/null +++ b/recbole/properties/model/lightgbm.yaml @@ -0,0 +1,23 @@ +convert_token_to_onehot: False +token_num_threshold: 10000 + +# Dataset +lgb_silent: False + +# Train +lgb_model: ~ +lgb_params: + boosting: gbdt + num_leaves: 90 + min_data_in_leaf: 30 + max_depth: -1 + learning_rate: 0.1 + objective: binary + lambda_l1: 0.1 + metric: ['auc', 'binary_logloss'] + force_row_wise: True +lgb_learning_rates: ~ +lgb_num_boost_round: 300 +lgb_early_stopping_rounds: ~ +lgb_verbose_eval: 100 + diff --git a/recbole/properties/model/xgboost.yaml b/recbole/properties/model/xgboost.yaml index 6a6620bf1..47d4f890f 100644 --- a/recbole/properties/model/xgboost.yaml +++ b/recbole/properties/model/xgboost.yaml @@ -1,14 +1,8 @@ -# Type of training method convert_token_to_onehot: False token_num_threshold: 10000 # DMatrix -xgb_weight: ~ -xgb_base_margin: ~ -xgb_missing: ~ xgb_silent: ~ -xgb_feature_names: ~ -xgb_feature_types: ~ xgb_nthread: ~ xgb_model: ~ @@ -26,11 +20,6 @@ xgb_params: seed: 2020 # nthread: -1 xgb_num_boost_round: 500 -# xgb_evals: ~ -xgb_obj: ~ -xgb_feval: ~ -xgb_maximize: ~ xgb_early_stopping_rounds: ~ -# xgb_evals_result: ~ xgb_verbose_eval: 100 diff --git a/recbole/trainer/trainer.py b/recbole/trainer/trainer.py index bfd761660..335407710 100644 --- a/recbole/trainer/trainer.py +++ b/recbole/trainer/trainer.py @@ -584,42 +584,17 @@ def __init__(self, config, model): self.epochs = 1 # Set the epoch to 1 when running memory based model -class xgboostTrainer(AbstractTrainer): - """xgboostTrainer is designed for XGBOOST. 
+class DecisionTreeTrainer(AbstractTrainer): + """DecisionTreeTrainer is designed for DecisionTree model. - """ - + """ def __init__(self, config, model): - super(xgboostTrainer, self).__init__(config, model) - - self.xgb = __import__('xgboost') + super(DecisionTreeTrainer, self).__init__(config, model) self.logger = getLogger() self.label_field = config['LABEL_FIELD'] - self.xgb_model = config['xgb_model'] self.convert_token_to_onehot = self.config['convert_token_to_onehot'] - # DMatrix params - self.weight = config['xgb_weight'] - self.base_margin = config['xgb_base_margin'] - self.missing = config['xgb_missing'] - self.silent = config['xgb_silent'] - self.feature_names = config['xgb_feature_names'] - self.feature_types = config['xgb_feature_types'] - self.nthread = config['xgb_nthread'] - - # train params - self.params = config['xgb_params'] - self.num_boost_round = config['xgb_num_boost_round'] - self.evals = () - self.obj = config['xgb_obj'] - self.feval = config['xgb_feval'] - self.maximize = config['xgb_maximize'] - self.early_stopping_rounds = config['xgb_early_stopping_rounds'] - self.evals_result = {} - self.verbose_eval = config['xgb_verbose_eval'] - self.callbacks = None - # evaluator self.eval_type = config['eval_type'] self.epochs = config['epochs'] @@ -634,13 +609,14 @@ def __init__(self, config, model): saved_model_file = '{}-{}.pth'.format(self.config['model'], get_local_time()) self.saved_model_file = os.path.join(self.checkpoint_dir, saved_model_file) - def _interaction_to_DMatrix(self, dataloader): - r"""Convert data format from interaction to DMatrix + def _interaction_to_sparse(self, dataloader): + r"""Convert data format from interaction to sparse or numpy Args: - dataloader (XgboostDataLoader): xgboost dataloader. + dataloader (DecisionTreeDataLoader): DecisionTreeDataLoader dataloader. Returns: - DMatrix: Data in the form of 'DMatrix'. + cur_data (sparse or numpy): data. + interaction_np[self.label_field] (numpy): label. 
""" interaction = dataloader.dataset[:] interaction_np = interaction.numpy() @@ -682,39 +658,16 @@ def _interaction_to_DMatrix(self, dataloader): cur_data = sparse.csc_matrix(onehot_data) - return self.xgb.DMatrix(data=cur_data, - label=interaction_np[self.label_field], - weight=self.weight, - base_margin=self.base_margin, - missing=self.missing, - silent=self.silent, - feature_names=self.feature_names, - feature_types=self.feature_types, - nthread=self.nthread) - - def _train_at_once(self, train_data, valid_data): - r""" - - Args: - train_data (XgboostDataLoader): XgboostDataLoader, which is the same with GeneralDataLoader. - valid_data (XgboostDataLoader): XgboostDataLoader, which is the same with GeneralDataLoader. - """ - self.dtrain = self._interaction_to_DMatrix(train_data) - self.dvalid = self._interaction_to_DMatrix(valid_data) - self.evals = [(self.dtrain, 'train'), (self.dvalid, 'valid')] - self.model = self.xgb.train(self.params, self.dtrain, self.num_boost_round, - self.evals, self.obj, self.feval, self.maximize, - self.early_stopping_rounds, self.evals_result, - self.verbose_eval, self.xgb_model, self.callbacks) - - self.model.save_model(self.saved_model_file) - self.xgb_model = self.saved_model_file + return cur_data, interaction_np[self.label_field] + + def _interaction_to_lib_datatype(self, dataloader): + pass def _valid_epoch(self, valid_data): r""" Args: - valid_data (XgboostDataLoader): XgboostDataLoader, which is the same with GeneralDataLoader. + valid_data (DecisionTreeDataLoader): DecisionTreeDataLoader, which is the same with GeneralDataLoader. 
""" valid_result = self.evaluate(valid_data) valid_score = calculate_valid_score(valid_result, self.valid_metric) @@ -722,8 +675,8 @@ def _valid_epoch(self, valid_data): def fit(self, train_data, valid_data=None, verbose=True, saved=True, show_progress=False): # load model - if self.xgb_model is not None: - self.model.load_model(self.xgb_model) + if self.boost_model is not None: + self.model.load_model(self.boost_model) self.best_valid_score = 0. self.best_valid_result = 0. @@ -748,14 +701,138 @@ def fit(self, train_data, valid_data=None, verbose=True, saved=True, show_progre return self.best_valid_score, self.best_valid_result + def evaluate(self): + pass + + +class xgboostTrainer(DecisionTreeTrainer): + """xgboostTrainer is designed for XGBOOST. + + """ + + def __init__(self, config, model): + super(xgboostTrainer, self).__init__(config, model) + + self.xgb = __import__('xgboost') + self.boost_model = config['xgb_model'] + self.silent = config['xgb_silent'] + self.nthread = config['xgb_nthread'] + + # train params + self.params = config['xgb_params'] + self.num_boost_round = config['xgb_num_boost_round'] + self.evals = () + self.early_stopping_rounds = config['xgb_early_stopping_rounds'] + self.evals_result = {} + self.verbose_eval = config['xgb_verbose_eval'] + self.callbacks = None + + def _interaction_to_lib_datatype(self, dataloader): + r"""Convert data format from interaction to DMatrix + + Args: + dataloader (DecisionTreeDataLoader): xgboost dataloader. + Returns: + DMatrix: Data in the form of 'DMatrix'. + """ + data, label = self._interaction_to_sparse(dataloader) + return self.xgb.DMatrix(data = data, label = label, silent = self.silent, nthread = self.nthread) + + def _train_at_once(self, train_data, valid_data): + r""" + + Args: + train_data (DecisionTreeDataLoader): DecisionTreeDataLoader, which is the same with GeneralDataLoader. + valid_data (DecisionTreeDataLoader): DecisionTreeDataLoader, which is the same with GeneralDataLoader. 
+ """ + self.dtrain = self._interaction_to_lib_datatype(train_data) + self.dvalid = self._interaction_to_lib_datatype(valid_data) + self.evals = [(self.dtrain, 'train'), (self.dvalid, 'valid')] + self.model = self.xgb.train(self.params, self.dtrain, self.num_boost_round, self.evals, + early_stopping_rounds = self.early_stopping_rounds, + evals_result = self.evals_result, + verbose_eval = self.verbose_eval, + xgb_model = self.boost_model, + callbacks = self.callbacks) + + self.model.save_model(self.saved_model_file) + self.boost_model = self.saved_model_file + def evaluate(self, eval_data, load_best_model=True, model_file=None, show_progress=False): self.eval_pred = torch.Tensor() self.eval_true = torch.Tensor() - self.deval = self._interaction_to_DMatrix(eval_data) + self.deval = self._interaction_to_lib_datatype(eval_data) self.eval_true = torch.Tensor(self.deval.get_label()) self.eval_pred = torch.Tensor(self.model.predict(self.deval)) batch_matrix_list = [[torch.stack((self.eval_true, self.eval_pred), 1)]] result = self.evaluator.evaluate(batch_matrix_list, eval_data) return result + + +class lightgbmTrainer(DecisionTreeTrainer): + """lightgbmTrainer is designed for lightgbm. + + """ + + def __init__(self, config, model): + super(lightgbmTrainer, self).__init__(config, model) + + self.lgb = __import__('lightgbm') + self.boost_model = config['lgb_model'] + self.silent = config['lgb_silent'] + + # train params + self.params = config['lgb_params'] + self.num_boost_round = config['lgb_num_boost_round'] + self.evals = () + self.early_stopping_rounds = config['lgb_early_stopping_rounds'] + self.evals_result = {} + self.verbose_eval = config['lgb_verbose_eval'] + self.learning_rates = config['lgb_learning_rates'] + self.callbacks = None + + def _interaction_to_lib_datatype(self, dataloader): + r"""Convert data format from interaction to Dataset + + Args: + dataloader (DecisionTreeDataLoader): lightgbm dataloader. 
+ Returns: + dataset(lgb.Dataset): Data in the form of 'lgb.Dataset'. + """ + data, label = self._interaction_to_sparse(dataloader) + return self.lgb.Dataset(data = data, label = label, silent = self.silent) + + def _train_at_once(self, train_data, valid_data): + r""" + + Args: + train_data (DecisionTreeDataLoader): DecisionTreeDataLoader, which is the same with GeneralDataLoader. + valid_data (DecisionTreeDataLoader): DecisionTreeDataLoader, which is the same with GeneralDataLoader. + """ + self.dtrain = self._interaction_to_lib_datatype(train_data) + self.dvalid = self._interaction_to_lib_datatype(valid_data) + self.evals = [self.dtrain, self.dvalid] + self.model = self.lgb.train(self.params, self.dtrain, self.num_boost_round, self.evals, + early_stopping_rounds = self.early_stopping_rounds, + evals_result = self.evals_result, + verbose_eval = self.verbose_eval, + learning_rates = self.learning_rates, + init_model = self.boost_model, + callbacks = self.callbacks) + + self.model.save_model(self.saved_model_file) + self.boost_model = self.saved_model_file + + def evaluate(self, eval_data, load_best_model=True, model_file=None, show_progress=False): + self.eval_pred = torch.Tensor() + self.eval_true = torch.Tensor() + + self.deval_data, self.deval_label = self._interaction_to_sparse(eval_data) + self.eval_true = torch.Tensor(self.deval_label) + self.eval_pred = torch.Tensor(self.model.predict(self.deval_data)) + + batch_matrix_list = [[torch.stack((self.eval_true, self.eval_pred), 1)]] + result = self.evaluator.evaluate(batch_matrix_list, eval_data) + return result diff --git a/recbole/utils/enum_type.py b/recbole/utils/enum_type.py index 84e15b812..ebc1faea7 100644 --- a/recbole/utils/enum_type.py +++ b/recbole/utils/enum_type.py @@ -26,7 +26,7 @@ class ModelType(Enum): KNOWLEDGE = 4 SOCIAL = 5 TRADITIONAL = 6 - XGBOOST = 7 + DECISIONTREE = 7 class DataLoaderType(Enum):