diff --git a/conda/meta.yaml b/conda/meta.yaml
index 7963dba0b..9c39b5bfe 100644
--- a/conda/meta.yaml
+++ b/conda/meta.yaml
@@ -9,26 +9,26 @@ requirements:
   build:
     - python
   host:
-    - python
+    - python >=3.6
     - numpy >=1.17.2
    - scipy ==1.6.0
    - pandas >=1.0.5
    - tqdm >=4.48.2
    - pyyaml >=5.1.0
    - scikit-learn >=0.23.2
-    - pytorch
+    - pytorch >=1.7.0
    - colorlog==4.7.2
    - colorama==0.4.4
    - tensorboard >=2.5.0
  run:
-    - python
+    - python >=3.6
    - numpy >=1.17.2
    - scipy ==1.6.0
    - pandas >=1.0.5
    - tqdm >=4.48.2
    - pyyaml >=5.1.0
    - scikit-learn >=0.23.2
-    - pytorch
+    - pytorch >=1.7.0
    - colorlog==4.7.2
    - colorama==0.4.4
    - tensorboard >=2.5.0
diff --git a/docs/source/asset/framework.png b/docs/source/asset/framework.png
new file mode 100644
index 000000000..add0b8028
Binary files /dev/null and b/docs/source/asset/framework.png differ
diff --git a/docs/source/asset/logo.png b/docs/source/asset/logo.png
new file mode 100644
index 000000000..047e61bcf
Binary files /dev/null and b/docs/source/asset/logo.png differ
diff --git a/docs/source/asset/tensorboard_1.png b/docs/source/asset/tensorboard_1.png
new file mode 100644
index 000000000..1a48d6dcd
Binary files /dev/null and b/docs/source/asset/tensorboard_1.png differ
diff --git a/docs/source/asset/tensorboard_2.png b/docs/source/asset/tensorboard_2.png
new file mode 100644
index 000000000..3a8a19e98
Binary files /dev/null and b/docs/source/asset/tensorboard_2.png differ
diff --git a/docs/source/developer_guide/customize_samplers.rst b/docs/source/developer_guide/customize_samplers.rst
index 3c37797cc..ebfa414bd 100644
--- a/docs/source/developer_guide/customize_samplers.rst
+++ b/docs/source/developer_guide/customize_samplers.rst
@@ -1,8 +1,22 @@
 Customize Samplers
 ======================
+In RecBole, the sampler module is designed to select negative items for training and evaluation.
+
 Here we present how to develop a new sampler, and apply it into RecBole.
 The new sampler is used when we need complex sampling method.
 
+RecBole currently supports two kinds of sampling strategies: **random negative sampling (RNS)** and **popularity-biased negative sampling (PNS)**.
+RNS selects negative items from a uniform distribution, while PNS selects negative items from a popularity-biased distribution.
+For PNS, the popularity-biased distribution is based on the total number of interactions of each item.
+
+In our framework, if you want to create a new sampler, you need to inherit :class:`~recbole.sampler.sampler.AbstractSampler`,
+implement :meth:`__init__()` (e.g., :meth:`~recbole.sampler.sampler.KGSampler.__init__()`),
+rewrite three functions (:meth:`_uni_sampling()`, :meth:`_get_candidates_list()` and :meth:`get_used_ids()`),
+and create a new sampling function.
+
+
 Here, we take the :class:`~recbole.sampler.sampler.KGSampler` as an example.
 
 
@@ -37,35 +51,36 @@ where we only need to invoke :obj:`super.__init__(distribution)`.
 
         super().__init__(distribution=distribution)
 
+Implement _uni_sampling()
+-------------------------------
+To implement RNS for :class:`KGSampler`, we need to rewrite :meth:`~recbole.sampler.sampler.AbstractSampler._uni_sampling`.
+Here we use :obj:`numpy.random.randint()` to randomly select the ``entity_id``. This function will return the
+selected samples' ids (here, ``entity_id``).
 
-Implement get_random_list()
-------------------------------
-We do not use the random function in python or numpy due to their lower efficiency.
-Instead, we realize our own :meth:`~recbole.sampler.sampler.AbstractSampler.random` function, where the key method is to combine the random list with the pointer.
-The pointer point to some element in the random list. When one calls :meth:`self.random`, the element is returned, and moves the pointer backward by one element.
-If the pointer point to the last element, then it will return to the head of the element.
+Example code:
+
+.. code:: python
 
-In :class:`~recbole.sampler.sampler.AbstractSampler`, the :meth:`~recbole.sampler.sampler.AbstractSampler.__init__` will call :meth:`~recbole.sampler.sampler.AbstractSampler.get_random_list`, and shuffle the results.
-We only need to return a list including all the elements.
+    def _uni_sampling(self, sample_num):
+        return np.random.randint(1, self.entity_num, sample_num)
 
-It should be noted ``0`` can be the token used for padding, thus one should remain this value.
+Implement _get_candidates_list()
+-------------------------------------
+To implement PNS for :class:`KGSampler`, we need to rewrite :meth:`~recbole.sampler.sampler.AbstractSampler._get_candidates_list`.
+This function is used to get a candidate list for PNS, and we will set the sampling distribution based on
+:obj:`Counter(candidate_list)`. This function will return a list of candidate ids.
 
 Example code:
 
-.. code:: python
+.. code:: python
 
-    def get_random_list(self):
-        if self.distribution == 'uniform':
-            return list(range(1, self.entity_num))
-        elif self.distribution == 'popularity':
-            return list(self.hid_list) + list(self.tid_list)
-        else:
-            raise NotImplementedError('Distribution [{}] has not been implemented'.format(self.distribution))
+    def _get_candidates_list(self):
+        return list(self.hid_list) + list(self.tid_list)
 
 Implement get_used_ids()
 ----------------------------
-For negative sampling, we do not want to sample positive instance, this function is used to compute the positive sample.
+For negative sampling, we do not want to sample positive instances; this function is used to record the positive samples.
 The function will return numpy, and the index is the ID.
 The return value will be saved in :attr:`self.used_ids`.
 
 Example code:
@@ -73,13 +88,20 @@ Example code:
 .. code:: python
 
     def get_used_ids(self):
-        used_tail_entity_id = np.array([set() for i in range(self.entity_num)])
+        used_tail_entity_id = np.array([set() for _ in range(self.entity_num)])
         for hid, tid in zip(self.hid_list, self.tid_list):
             used_tail_entity_id[hid].add(tid)
+
+        for used_tail_set in used_tail_entity_id:
+            if len(used_tail_set) + 1 == self.entity_num:  # [pad] is an entity.
+                raise ValueError(
+                    'Some head entities have relations with all entities, '
+                    'so we cannot sample negative entities for them.'
+                )
         return used_tail_entity_id
 
-Implementing the sampling function
+Implement the sampling function
 -----------------------------------
 In :class:`~recbole.sampler.sampler.AbstractSampler`, we have implemented :meth:`~recbole.sampler.sampler.AbstractSampler.sample_by_key_ids` function,
-where we have three parameters: :attr:`key_ids`, :attr:`num` and :attr:`used_ids`.
+where we have two parameters: :attr:`key_ids` and :attr:`num`; the used positive samples are taken from :attr:`self.used_ids`.
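+For example, the sampled ids returned by :meth:`sample_by_key_ids` are interleaved by key (a sketch; ``sampler`` stands for a hypothetical :class:`KGSampler` instance):
+
+.. code:: python
+
+    # with two keys and num=2, the result is laid out as
+    # [neg1_of_key0, neg1_of_key1, neg2_of_key0, neg2_of_key1]
+    value_ids = sampler.sample_by_key_ids([1, 2], num=2)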
@@ -109,12 +131,6 @@ Complete Code
 .. code:: python
 
     class KGSampler(AbstractSampler):
-        """:class:`KGSampler` is used to sample negative entities in a knowledge graph.
-
-        Args:
-            dataset (Dataset): The knowledge graph dataset, which contains triplets in a knowledge graph.
-            distribution (str, optional): Distribution of the negative entities. Defaults to 'uniform'.
-        """
 
         def __init__(self, dataset, distribution='uniform'):
             self.dataset = dataset
@@ -128,47 +144,31 @@ Complete Code
 
             super().__init__(distribution=distribution)
 
-        def get_random_list(self):
-            """
-            Returns:
-                np.ndarray or list: Random list of entity_id.
-            """
-            if self.distribution == 'uniform':
-                return list(range(1, self.entity_num))
-            elif self.distribution == 'popularity':
-                return list(self.hid_list) + list(self.tid_list)
-            else:
-                raise NotImplementedError('Distribution [{}] has not been implemented'.format(self.distribution))
+        def _uni_sampling(self, sample_num):
+            return np.random.randint(1, self.entity_num, sample_num)
+
+        def _get_candidates_list(self):
+            return list(self.hid_list) + list(self.tid_list)
 
         def get_used_ids(self):
-            """
-            Returns:
-                np.ndarray: Used entity_ids is the same as tail_entity_ids in knowledge graph.
-                Index is head_entity_id, and element is a set of tail_entity_ids.
-            """
-            used_tail_entity_id = np.array([set() for i in range(self.entity_num)])
+            used_tail_entity_id = np.array([set() for _ in range(self.entity_num)])
             for hid, tid in zip(self.hid_list, self.tid_list):
                 used_tail_entity_id[hid].add(tid)
+
+            for used_tail_set in used_tail_entity_id:
+                if len(used_tail_set) + 1 == self.entity_num:  # [pad] is an entity.
+                    raise ValueError(
+                        'Some head entities have relations with all entities, '
+                        'so we cannot sample negative entities for them.'
+                    )
             return used_tail_entity_id
 
         def sample_by_entity_ids(self, head_entity_ids, num=1):
-            """Sampling by head_entity_ids.
-
-            Args:
-                head_entity_ids (np.ndarray or list): Input head_entity_ids.
-                num (int, optional): Number of sampled entity_ids for each head_entity_id. Defaults to ``1``.
-
-            Returns:
-                np.ndarray: Sampled entity_ids.
-                entity_ids[0], entity_ids[len(head_entity_ids)], entity_ids[len(head_entity_ids) * 2], ...,
-                entity_id[len(head_entity_ids) * (num - 1)] is sampled for head_entity_ids[0];
-                entity_ids[1], entity_ids[len(head_entity_ids) + 1], entity_ids[len(head_entity_ids) * 2 + 1], ...,
-                entity_id[len(head_entity_ids) * (num - 1) + 1] is sampled for head_entity_ids[1]; ...; and so on.
-            """
             try:
-                return self.sample_by_key_ids(head_entity_ids, num, self.used_ids[head_entity_ids])
+                return self.sample_by_key_ids(head_entity_ids, num)
             except IndexError:
                 for head_entity_id in head_entity_ids:
                     if head_entity_id not in self.head_entities:
-                        raise ValueError('head_entity_id [{}] not exist'.format(head_entity_id))
+                        raise ValueError(f'head_entity_id [{head_entity_id}] does not exist.')
+
diff --git a/docs/source/get_started/install.rst b/docs/source/get_started/install.rst
index b798defcc..f2a8bcdeb 100644
--- a/docs/source/get_started/install.rst
+++ b/docs/source/get_started/install.rst
@@ -11,7 +11,7 @@ RecBole is compatible with the following operating systems:
 * Windows 10
 * macOS X
 
-Python 3.6 (or later), torch 1.6.0 (or later) are required to install our library. If you want to use RecBole with GPU,
+Python 3.6 (or later), torch 1.7.0 (or later) are required to install our library. If you want to use RecBole with GPU,
 please ensure that CUDA or CUDAToolkit version is 9.2 or later.
 This requires NVIDIA driver version >= 396.26 (for Linux) or >= 397.44 (for Windows10).
 
@@ -54,3 +54,62 @@ Run the following command to install:
 .. code:: bash
 
     pip install -e . --verbose
+
+Try to run:
+-------------------------
+To check whether you have successfully installed RecBole, you can create a new python file (e.g., `run.py`),
+and write the following code:
+
+.. code:: python
+
+    from recbole.quick_start import run_recbole
+
+    run_recbole(model='BPR', dataset='ml-100k')
+
+
+Then run the following command:
+
+.. code:: bash
+
+    python run.py
+
+This will train and test the BPR model on the ml-100k dataset, and you will obtain some output like:
+
+.. code:: none
+
+    05 Aug 02:16 INFO ml-100k
+    The number of users: 944
+    Average actions of users: 106.04453870625663
+    The number of items: 1683
+    Average actions of items: 59.45303210463734
+    The number of inters: 100000
+    The sparsity of the dataset: 93.70575143257098%
+    Remain Fields: ['user_id', 'item_id', 'rating', 'timestamp']
+    05 Aug 02:16 INFO [Training]: train_batch_size = [2048] negative sampling: [{'uniform': 1}]
+    05 Aug 02:16 INFO [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [0.8, 0.1, 0.1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]
+    05 Aug 02:16 INFO BPR(
+        (user_embedding): Embedding(944, 64)
+        (item_embedding): Embedding(1683, 64)
+        (loss): BPRLoss()
+    )
+    Trainable parameters: 168128
+    Train     0: 100%|████████████████████████| 40/40 [00:00<00:00, 219.54it/s, GPU RAM: 0.01 G/11.91 G]
+    05 Aug 02:16 INFO epoch 0 training [time: 0.19s, train loss: 27.7228]
+    Evaluate   : 100%|██████████████████████| 472/472 [00:00<00:00, 506.11it/s, GPU RAM: 0.01 G/11.91 G]
+    05 Aug 02:16 INFO epoch 0 evaluating [time: 0.94s, valid_score: 0.020500]
+    05 Aug 02:16 INFO valid result:
+    recall@10 : 0.0067    mrr@10 : 0.0205    ndcg@10 : 0.0086    hit@10 : 0.0732    precision@10 : 0.0081
+
+    ...
+
+    Train    96: 100%|████████████████████████| 40/40 [00:00<00:00, 230.65it/s, GPU RAM: 0.01 G/11.91 G]
+    05 Aug 02:19 INFO epoch 96 training [time: 0.18s, train loss: 3.7170]
+    Evaluate   : 100%|██████████████████████| 472/472 [00:00<00:00, 800.46it/s, GPU RAM: 0.01 G/11.91 G]
+    05 Aug 02:19 INFO epoch 96 evaluating [time: 0.60s, valid_score: 0.375200]
+    05 Aug 02:19 INFO valid result:
+    recall@10 : 0.2162    mrr@10 : 0.3752    ndcg@10 : 0.2284    hit@10 : 0.7508    precision@10 : 0.1602
+    05 Aug 02:19 INFO Finished training, best eval result in epoch 85
+    05 Aug 02:19 INFO Loading model structure and parameters from saved/BPR-Aug-05-2021_02-17-51.pth
+    Evaluate   : 100%|██████████████████████| 472/472 [00:00<00:00, 832.85it/s, GPU RAM: 0.01 G/11.91 G]
+    05 Aug 02:19 INFO best valid : {'recall@10': 0.2195, 'mrr@10': 0.3871, 'ndcg@10': 0.2344, 'hit@10': 0.7582, 'precision@10': 0.1627}
+    05 Aug 02:19 INFO test result: {'recall@10': 0.2523, 'mrr@10': 0.4855, 'ndcg@10': 0.292, 'hit@10': 0.7953, 'precision@10': 0.1962}
\ No newline at end of file
diff --git a/docs/source/get_started/introduction.rst b/docs/source/get_started/introduction.rst
deleted file mode 100644
index d17c56e20..000000000
--- a/docs/source/get_started/introduction.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-Introduction
-==============
-
-RecBole is a unified, comprehensive and efficient framework developed based on PyTorch.
-It aims to help the researchers to reproduce and develop recommendation models.
-
-In the first release, our library includes 73 recommendation algorithms `[Model List]`_, covering four major categories:
-
-- General Recommendation
-- Sequential Recommendation
-- Context-aware Recommendation
-- Knowledge-based Recommendation
-
-We design a unified and flexible data file format, and provide the support for 28 benchmark recommendation datasets `[Collected Datasets]`_. A user can apply the provided script to process the original data copy, or simply download the processed datasets by our team.
-
-Features:
-
-- General and extensible data structure
-    We deign general and extensible data structures to unify the formatting and usage of various recommendation datasets.
-- Comprehensive benchmark models and datasets
-    We implement 73 commonly used recommendation algorithms, and provide the formatted copies of 28 recommendation datasets.
-- Efficient GPU-accelerated execution
-    We design many tailored strategies in the GPU environment to enhance the efficiency of our library.
-- Extensive and standard evaluation protocols
-    We support a series of commonly used evaluation protocols or settings for testing and comparing recommendation algorithms.
-
-.. _[Collected Datasets]:
-    /dataset_list.html
-
-.. _[Model List]:
-    /model_list.html
diff --git a/docs/source/get_started/quick_start.rst b/docs/source/get_started/quick_start.rst
index 9a97a8a15..d65466132 100644
--- a/docs/source/get_started/quick_start.rst
+++ b/docs/source/get_started/quick_start.rst
@@ -1,112 +1,183 @@
 Quick Start
 ===============
-Here is a quick-start example for using RecBole.
+Here is a quick-start example for using RecBole. We will show you how to train and test the **BPR** model on the **ml-100k** dataset, both from the **API**
+and from the **source code**.
 
-Quick-start From Source
+
+Quick-start From API
 --------------------------
-With the source code of `RecBole `_,
-the following script can be used to run a toy example of our library.
 
-.. code:: bash
+1. Prepare your data:
+>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+Before running a model, you first need to prepare and load the data. To help users get started quickly,
+RecBole has a built-in dataset **ml-100k**, which you can use directly. However, if you want to use other datasets, you can read
+:doc:`../usage/running_new_dataset` for more information.
 
-    python run_recbole.py
+Then, you need to set the data config for data loading. You can create a `yaml` file called `test.yaml` and write the following settings:
 
-This script will run the BPR model on the ml-100k dataset.
+.. code:: yaml
 
-Typically, this example takes less than one minute. We will obtain some output like:
+    # dataset config
+    USER_ID_FIELD: user_id
+    ITEM_ID_FIELD: item_id
+    load_col:
+        inter: [user_id, item_id]
 
-.. code:: none
+For more details of the data config, please refer to :doc:`../user_guide/config/data_settings`.
 
-    INFO ml-100k
-    The number of users: 944
-    Average actions of users: 106.04453870625663
-    The number of items: 1683
-    Average actions of items: 59.45303210463734
-    The number of inters: 100000
-    The sparsity of the dataset: 93.70575143257098%
+2. Choose a model:
+>>>>>>>>>>>>>>>>>>>>>>>>>
+In RecBole, we implement 73 recommendation models covering general recommendation, sequential recommendation,
+context-aware recommendation and knowledge-based recommendation. You can choose a model from our :doc:`../user_guide/model_intro`.
+Here we choose the BPR model to train and test.
 
-    INFO Evaluation Settings:
-    Group by user_id
-    Ordering: {'strategy': 'shuffle'}
-    Splitting: {'strategy': 'by_ratio', 'ratios': [0.8, 0.1, 0.1]}
-    Negative Sampling: {'strategy': 'full', 'distribution': 'uniform'}
+Then, you need to set the parameters for the BPR model. You can check :doc:`../user_guide/model/general/bpr` and add the model settings into `test.yaml`, like:
 
-    INFO BPRMF(
-    (user_embedding): Embedding(944, 64)
-    (item_embedding): Embedding(1683, 64)
-    (loss): BPRLoss()
-    )
-    Trainable parameters: 168128
+.. code:: yaml
+
+    # model config
+    embedding_size: 64
 
-    INFO epoch 0 training [time: 0.27s, train loss: 27.7231]
-    INFO epoch 0 evaluating [time: 0.12s, valid_score: 0.021900]
-    INFO valid result:
-    recall@10: 0.0073    mrr@10: 0.0219    ndcg@10: 0.0093    hit@10: 0.0795    precision@10: 0.0088
+If you want to run different models, you can read :doc:`../user_guide/usage/running_different_models` for more information.
 
-    ...
 
-    INFO epoch 63 training [time: 0.19s, train loss: 4.7660]
-    INFO epoch 63 evaluating [time: 0.08s, valid_score: 0.394500]
-    INFO valid result:
-    recall@10: 0.2156    mrr@10: 0.3945    ndcg@10: 0.2332    hit@10: 0.7593    precision@10: 0.1591
+3. Set training and evaluation config:
+>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+In RecBole, we support multiple training and evaluation methods. You can choose how to train and test the model simply by setting the config.
 
-    INFO Finished training, best eval result in epoch 52
-    INFO Loading model structure and parameters from saved/***.pth
-    INFO best valid result:
-    recall@10: 0.2169    mrr@10: 0.4005    ndcg@10: 0.235    hit@10: 0.7582    precision@10: 0.1598
-    INFO test result:
-    recall@10: 0.2368    mrr@10: 0.4519    ndcg@10: 0.2768    hit@10: 0.7614    precision@10: 0.1901
+Here we want to train and test the BPR model with the training-validation-test method (optimize model parameters on the training set, do parameter selection according to the results on the validation set,
+and finally report the results on the test set) and evaluate the model performance by full ranking with all item candidates,
+so we can add the following settings into `test.yaml`.
 
-Note that using the quick start pipeline we provide, the original dataset will be divided into training set, validation set and test set by default.
-We optimize model parameters on the training set, do parameter selection according to the results on the validation set,
-and finally report the results on the test set.
+.. code:: yaml
 
-If you want to change the parameters, such as ``learning_rate``, ``embedding_size``,
-just set the additional command parameters as you need:
+    # Training and evaluation config
+    epochs: 500
+    train_batch_size: 4096
+    eval_batch_size: 4096
+    neg_sampling:
+        uniform: 1
+    eval_args:
+        group_by: user
+        order: RO
+        split: {'RS': [0.8,0.1,0.1]}
+        mode: full
+    metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
+    topk: 10
+    valid_metric: MRR@10
+    metric_decimal_place: 4
 
-.. code:: bash
+For more details of the training and evaluation config, please refer to :doc:`../user_guide/config/training_settings` and :doc:`../user_guide/config/evaluation_settings`.
 
-    python run_recbole.py --learning_rate=0.0001 --embedding_size=128
+4. Run the model and collect the result
+>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+Now that you have finished all the preparations, it's time to run the model!
+You can create a new python file (e.g., `run.py`), and write the following code:
 
-If you want to change the models, just run the script by setting additional command parameters:
+.. code:: python
 
-.. code:: bash
+    from recbole.quick_start import run_recbole
 
-    python run_recbole.py --model=[model_name]
+    run_recbole(model='BPR', dataset='ml-100k')
 
-``model_name`` indicates the model to be initialized.
-RecBole has implemented four categories of recommendation algorithms
-including general recommendation, context-aware recommendation,
-sequential recommendation and knowledge-based recommendation.
-More details can be found in :doc:`../user_guide/model_intro`.
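+Note that ``run_recbole`` can also pick up `test.yaml` directly in code via its ``config_file_list`` argument, which is forwarded to the Config module (a sketch, assuming `test.yaml` sits in the working directory):
+
+.. code:: python
+
+    from recbole.quick_start import run_recbole
+
+    # the settings in test.yaml are merged by the Config module
+    run_recbole(model='BPR', dataset='ml-100k', config_file_list=['test.yaml'])
+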
+Then run the following command:
 
-The datasets can be changed according to :doc:`../user_guide/data_intro`.
+.. code:: bash
 
-Quick-start From API
--------------------------
+    python run.py --config_files=test.yaml
 
-If RecBole is installed from ``pip`` or ``conda``, you can create a new python file (e.g., `run.py`),
-and write the following code:
+And you will obtain output like:
 
-.. code:: python
+.. code:: none
 
-    from recbole.quick_start import run_recbole
+    24 Aug 01:46 INFO ml-100k
+    The number of users: 944
+    Average actions of users: 106.04453870625663
+    The number of items: 1683
+    Average actions of items: 59.45303210463734
+    The number of inters: 100000
+    The sparsity of the dataset: 93.70575143257098%
+    Remain Fields: ['user_id', 'item_id', 'rating', 'timestamp']
+    24 Aug 01:46 INFO [Training]: train_batch_size = [2048] negative sampling: [{'uniform': 1}]
+    24 Aug 01:46 INFO [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [0.8, 0.1, 0.1]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]
+    24 Aug 01:46 INFO BPR(
+        (user_embedding): Embedding(944, 64)
+        (item_embedding): Embedding(1683, 64)
+        (loss): BPRLoss()
+    )
+    Trainable parameters: 168128
+    Train     0: 100%|████████████████████████| 40/40 [00:00<00:00, 200.47it/s, GPU RAM: 0.01 G/11.91 G]
+    24 Aug 01:46 INFO epoch 0 training [time: 0.21s, train loss: 27.7228]
+    Evaluate   : 100%|██████████████████████| 472/472 [00:00<00:00, 518.65it/s, GPU RAM: 0.01 G/11.91 G]
+    24 Aug 01:46 INFO epoch 0 evaluating [time: 0.92s, valid_score: 0.020500]
+    ......
+    Train    96: 100%|████████████████████████| 40/40 [00:00<00:00, 229.26it/s, GPU RAM: 0.01 G/11.91 G]
+    24 Aug 01:47 INFO epoch 96 training [time: 0.18s, train loss: 3.7170]
+    Evaluate   : 100%|██████████████████████| 472/472 [00:00<00:00, 857.00it/s, GPU RAM: 0.01 G/11.91 G]
+    24 Aug 01:47 INFO epoch 96 evaluating [time: 0.56s, valid_score: 0.375200]
+    24 Aug 01:47 INFO valid result:
+    recall@10 : 0.2162    mrr@10 : 0.3752    ndcg@10 : 0.2284    hit@10 : 0.7508    precision@10 : 0.1602
+    24 Aug 01:47 INFO Finished training, best eval result in epoch 85
+    24 Aug 01:47 INFO Loading model structure and parameters from saved/BPR-Aug-24-2021_01-46-43.pth
+    Evaluate   : 100%|██████████████████████| 472/472 [00:00<00:00, 866.53it/s, GPU RAM: 0.01 G/11.91 G]
+    24 Aug 01:47 INFO best valid : {'recall@10': 0.2195, 'mrr@10': 0.3871, 'ndcg@10': 0.2344, 'hit@10': 0.7582, 'precision@10': 0.1627}
+    24 Aug 01:47 INFO test result: {'recall@10': 0.2523, 'mrr@10': 0.4855, 'ndcg@10': 0.292, 'hit@10': 0.7953, 'precision@10': 0.1962}
+
+Finally, you will get the model's performance on the test set, and the model file will be saved under `saved/`. Besides,
+RecBole allows tracking and visualizing the train loss and valid score with TensorBoard; please read :doc:`../user_guide/usage/use_tensorboard` for more details.
+
+The above is the whole process of running a model in RecBole, and you can read the other docs for in-depth usage.
 
-    run_recbole()
+Quick-start From Source
+--------------------------
+Besides using the API, you can also directly run the source code of `RecBole `_.
+The whole process is similar to Quick-start From API.
+You can create a `yaml` file called `test.yaml` and set all the config as follows:
+
+.. code:: yaml
+
+    # dataset config
+    USER_ID_FIELD: user_id
+    ITEM_ID_FIELD: item_id
+    load_col:
+        inter: [user_id, item_id]
+
+    # model config
+    embedding_size: 64
+
+    # Training and evaluation config
+    epochs: 500
+    train_batch_size: 4096
+    eval_batch_size: 4096
+    neg_sampling:
+        uniform: 1
+    eval_args:
+        group_by: user
+        order: RO
+        split: {'RS': [0.8,0.1,0.1]}
+        mode: full
+    metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
+    topk: 10
+    valid_metric: MRR@10
+    metric_decimal_place: 4
 
 Then run the following command:
 
 .. code:: bash
 
-    python run.py --dataset=ml-100k --model=BPR
+    python run_recbole.py --model=BPR --dataset=ml-100k --config_files=test.yaml
+
+And you will get the output of running the BPR model on the ml-100k dataset.
+
+If you want to change the parameters, such as ``embedding_size``,
+just set the additional command parameters as you need:
 
-This will perform the training and test of the BPR model on the ml-100k dataset.
+.. code:: bash
 
-One can also use similar methods as mentioned above to run different models, parameters or datasets,
-the operations are same with `Quick-start From Source`_.
+    python run_recbole.py --model=BPR --dataset=ml-100k --config_files=test.yaml --embedding_size=128
 
 In-depth Usage
@@ -116,5 +187,5 @@ For a more in-depth usage about RecBole, take a look at
 - :doc:`../user_guide/config_settings`
 - :doc:`../user_guide/data_intro`
 - :doc:`../user_guide/model_intro`
-- :doc:`../user_guide/evaluation_support`
+- :doc:`../user_guide/train_eval_intro`
 - :doc:`../user_guide/usage`
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 91953e455..1b44669c4 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,15 +1,50 @@
 .. RecBole documentation master file.
-
-RecBole v0.2.0
+.. title:: RecBole v1.0.0
+.. image:: asset/logo.png
 =========================================================
 
 `HomePage `_ | `Docs `_ | `GitHub `_ | `Datasets `_ | `v0.1.2 `_
 
+Introduction
+-------------------------
+RecBole is a unified, comprehensive and efficient framework developed based on PyTorch.
+It aims to help researchers reproduce and develop recommendation models.
+
+In the latest release, our library includes 73 recommendation algorithms `[Model List]`_, covering four major categories:
+
+- General Recommendation
+- Sequential Recommendation
+- Context-aware Recommendation
+- Knowledge-based Recommendation
+
+We design a unified and flexible data file format, and provide the support for 28 benchmark recommendation datasets `[Collected Datasets]`_. A user can apply the provided script to process the original data copy, or simply download the processed datasets by our team.
+
+.. image:: asset/framework.png
+    :width: 600
+    :align: center
+
+Features:
+
+- General and extensible data structure
+    We design general and extensible data structures to unify the formatting and usage of various recommendation datasets.
+- Comprehensive benchmark models and datasets
+    We implement 73 commonly used recommendation algorithms, and provide the formatted copies of 28 recommendation datasets.
+- Efficient GPU-accelerated execution
+    We design many tailored strategies in the GPU environment to enhance the efficiency of our library.
+- Extensive and standard evaluation protocols
+    We support a series of commonly used evaluation protocols or settings for testing and comparing recommendation algorithms.
+
+.. _[Collected Datasets]:
+    /dataset_list.html
+
+.. _[Model List]:
+    /model_list.html
+
+
 .. toctree::
    :maxdepth: 1
    :caption: Get Started
 
-   get_started/introduction
    get_started/install
    get_started/quick_start
 
@@ -20,7 +55,7 @@ RecBole v0.2.0
    user_guide/config_settings
    user_guide/data_intro
    user_guide/model_intro
-   user_guide/evaluation_support
+   user_guide/train_eval_intro
    user_guide/usage
 
@@ -48,13 +83,21 @@ RecBole v0.2.0
    recbole/recbole.trainer.hyper_tuning
    recbole/recbole.trainer.trainer
    recbole/recbole.utils.case_study
-   recbole/recbole.utils.enum_type
-   recbole/recbole.utils.logger
    recbole/recbole.utils.utils
 
+The Team
+------------------
+RecBole is developed and maintained by `RUC, BUPT, ECNU `_.
+
+Here is the list of our lead developers in each development phase. They are the souls of RecBole and have made outstanding contributions.
 
-Indices and tables
-==================
+====================== =============== =============================================
+Time                   Version         Lead Developers
+====================== =============== =============================================
+June 2020 ~ Nov. 2020  v0.1.1          `Shanlei Mu `_, `Yupeng Hou `_, `Zihan Lin `_, `Kaiyuan Li `_
+Nov. 2020 ~ Now        v0.1.2 ~ v1.0.0 `Yushuo Chen `_, `Xingyu Pan `_
+====================== =============== =============================================
 
-* :ref:`genindex`
-* :ref:`search`
+License
+------------
+RecBole uses `MIT License `_.
\ No newline at end of file
diff --git a/docs/source/user_guide/data/data_args.rst b/docs/source/user_guide/config/data_settings.rst
similarity index 99%
rename from docs/source/user_guide/data/data_args.rst
rename to docs/source/user_guide/config/data_settings.rst
index 97ce58835..156982f4c 100644
--- a/docs/source/user_guide/data/data_args.rst
+++ b/docs/source/user_guide/config/data_settings.rst
@@ -1,4 +1,4 @@
-Args for Data
+Data settings
 =========================
 
 RecBole provides several arguments for describing:
diff --git a/docs/source/user_guide/config/environment_settings.rst b/docs/source/user_guide/config/environment_settings.rst
new file mode 100644
index 000000000..dc4bb07b8
--- /dev/null
+++ b/docs/source/user_guide/config/environment_settings.rst
@@ -0,0 +1,26 @@
+Environment settings
+===========================
+Environment settings are designed to set the basic parameters of the running environment.
+
+- ``gpu_id (int or str)`` : The id of the GPU device. Defaults to ``0``.
+- ``use_gpu (bool)`` : Whether or not to use GPU. If True, the GPU is used; otherwise the CPU is used.
+  Defaults to ``True``.
+- ``seed (int)`` : Random seed. Defaults to ``2020``.
+- ``state (str)`` : Logging level. Defaults to ``'INFO'``.
+  Range in ``['INFO', 'DEBUG', 'WARNING', 'ERROR', 'CRITICAL']``.
+- ``reproducibility (bool)`` : If True, the tool will use deterministic
+  convolution algorithms, which makes the result reproducible. If False,
+  the tool will benchmark multiple convolution algorithms and select the fastest one,
+  which makes the result not reproducible but can speed up model training in
+  some cases. Defaults to ``True``.
+- ``data_path (str)`` : The path of the input dataset. Defaults to ``'dataset/'``.
+- ``checkpoint_dir (str)`` : The path to save checkpoint files.
+  Defaults to ``'saved/'``.
+- ``show_progress (bool)`` : Show the progress of each training and evaluation epoch.
+  Defaults to ``True``.
+- ``save_dataset (bool)``: Whether or not to save the filtered dataset.
+  If True, the filtered dataset is saved; otherwise it is not.
+  Defaults to ``False``.
+- ``save_dataloaders (bool)``: Whether or not to save the split dataloaders.
+  If True, the split dataloaders are saved; otherwise they are not.
+  Defaults to ``False``.
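+
+For example, these settings can be overridden through the Config module like any other parameter (a sketch):
+
+.. code:: python
+
+    from recbole.config import Config
+
+    # run on GPU 1, keep results reproducible, and cache the filtered dataset
+    config = Config(model='BPR', dataset='ml-100k',
+                    config_dict={'gpu_id': 1, 'reproducibility': True, 'save_dataset': True})
+    print(config['gpu_id'])  # 1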
diff --git a/docs/source/user_guide/config/evaluation_settings.rst b/docs/source/user_guide/config/evaluation_settings.rst
new file mode 100644
index 000000000..42fffc633
--- /dev/null
+++ b/docs/source/user_guide/config/evaluation_settings.rst
@@ -0,0 +1,34 @@
+Evaluation Settings
+===========================
+Evaluation settings are designed to set parameters about model evaluation.
+
+
+
+- ``eval_args (dict)``: This parameter has 4 keys: ``group_by``, ``order``, ``split``, and ``mode``,
+  which respectively control the data grouping strategy, data ordering strategy, data splitting strategy
+  and evaluation mode for model evaluation.
+
+  - ``group_by (str)``: decides how we group the data in `.inter`. Now we support two kinds of grouping strategies: ``['user', 'none']``. If the value of ``group_by`` is ``user``, the data will be grouped by the column of ``USER_ID_FIELD`` and split along the user dimension. If the value is ``none``, the data won't be grouped. The default value is ``user``.
+
+  - ``order (str)``: decides how we sort the data in `.inter`. Now we support two kinds of ordering strategies: ``['RO', 'TO']``, which denote random ordering and temporal ordering. For ``RO``, we will shuffle the data and then split it in this order. For ``TO``, we will sort the data by the column of ``TIME_FIELD`` in ascending order and then split it in this order. The default value is ``RO``.
+
+  - ``split (dict)``: decides how we split the data in `.inter`. Now we support two kinds of splitting strategies: ``['RS', 'LS']``, which denote ratio-based data splitting and leave-one-out data splitting. If the key of ``split`` is ``RS``, you need to set the splitting ratio like ``[0.8,0.1,0.1]``, ``[7,2,1]`` or ``[8,0,2]``, which denotes the ratio of the training set, validation set and testing set respectively. If the key of ``split`` is ``LS``, we now support three kinds of ``LS`` modes: ``['valid_and_test', 'valid_only', 'test_only']`` and you should choose one mode as the value of ``LS``. The default value of ``split`` is ``{'RS': [0.8,0.1,0.1]}``.
+
+  - ``mode (str)``: decides the data range on which we evaluate the model. Now we support four kinds of evaluation modes: ``['full','unixxx','popxxx','labeled']``. ``full``, ``unixxx`` and ``popxxx`` are designed for evaluation on implicit feedback (data without labels). For implicit feedback, we regard the items with observed interactions as positive items and those without observed interactions as negative items. ``full`` means evaluating the model on the set of all items. ``unixxx``, for example ``uni100``, means uniformly sampling 100 negative items for each positive item in the testing set, and evaluating the model on these positive items together with their sampled negative items. ``popxxx``, for example ``pop100``, means sampling 100 negative items for each positive item in the testing set based on item popularity (:obj:`Counter(item)` in the `.inter` file), and evaluating the model likewise. Here ``xxx`` must be an integer. For explicit feedback (data with labels), you should set the mode to ``labeled`` (matching the range above) and we will evaluate the model based on your labels. The default value is ``full``.
+
+- ``repeatable (bool)``: Whether to evaluate the result with a repeatable recommendation scene.
+  Note that it is disabled for sequential models as the recommendation is already repeatable.
+  For other models, defaults to ``False``.
+- ``metrics (list or str)``: Evaluation metrics. Defaults to
+  ``['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']``. Range in
+  ``['Recall', 'MRR', 'NDCG', 'Hit', 'MAP', 'Precision', 'AUC',
+  'MAE', 'RMSE', 'LogLoss', 'ItemCoverage', 'AveragePopularity',
+  'GiniIndex', 'ShannonEntropy', 'TailPercentage']``. Note that value-based
+  metrics and ranking-based metrics can not be used together.
+- ``topk (list or int or None)``: The value of k for topk evaluation metrics.
+  Defaults to ``10``.
+- ``valid_metric (str)``: The evaluation metric for early stopping.
+  It must be one of the used ``metrics``. Defaults to ``'MRR@10'``.
+- ``eval_batch_size (int)``: The evaluation batch size. Defaults to ``4096``.
+- ``metric_decimal_place (int)``: The decimal place of metric scores. Defaults to ``4``.
+
diff --git a/docs/source/user_guide/config/parameters_configuration.rst b/docs/source/user_guide/config/parameters_configuration.rst
new file mode 100644
index 000000000..a0c57411d
--- /dev/null
+++ b/docs/source/user_guide/config/parameters_configuration.rst
@@ -0,0 +1,149 @@
+Parameters Configuration
+------------------------------
+RecBole supports three types of parameter configurations: Config files,
+Parameter Dicts and Command Line. The parameters are assigned via the
+Configuration module.
+
+Config Files
+^^^^^^^^^^^^^^^^
+Config Files should be organized in the format of yaml.
+The users should write their parameters according to the rules aligned with
+yaml, and the final config files are processed by the configuration module
+to complete the parameter settings.
+
+To begin with, we write the parameters into the yaml files (e.g. `example.yaml`).
+
+.. code:: yaml
+
+    gpu_id: 1
+    training_batch_size: 1024
+
+Then, the yaml files are conveyed to the configuration module to finish the
+parameter settings.
+
+.. code:: python
+
+    from recbole.config import Config
+
+    config = Config(model='BPR', dataset='ml-100k', config_file_list=['example.yaml'])
+    print('gpu_id: ', config['gpu_id'])
+    print('training_batch_size: ', config['training_batch_size'])
+
+
+output:
+
+.. code:: bash
+
+    gpu_id: 1
+    training_batch_size: 1024
+
+The parameter ``config_file_list`` supports multiple yaml files.
+
+For more details on yaml, please refer to YAML_.
+
+.. _YAML: https://yaml.org/
+
+When using our toolkit, the parameters belonging to **Dataset parameters** and
+Evaluation Settings of **Basic Parameters** are recommended to be written into
+the config files, which may be convenient for reusing the configurations.
+
+Parameter Dicts
+^^^^^^^^^^^^^^^^^^
+Parameter Dict is realized by the dict data structure in python, where the key
+is the parameter name, and the value is the parameter value. The users can write their
+parameters into a dict, and input it into the configuration module.
+
+An example is as follows:
+
+.. code:: python
+
+    from recbole.config import Config
+
+    parameter_dict = {
+        'gpu_id': 2,
+        'training_batch_size': 512
+    }
+    config = Config(model='BPR', dataset='ml-100k', config_dict=parameter_dict)
+    print('gpu_id: ', config['gpu_id'])
+    print('training_batch_size: ', config['training_batch_size'])
+
+output:
+
+.. code:: bash
+
+    gpu_id: 2
+    training_batch_size: 512
+
+
+Command Line
+^^^^^^^^^^^^^^^^^^^^^^^^
+We can also assign parameters based on the command line.
+The parameters in the command line can be read from the configuration module.
+The format is: `--parameter_name=[parameter_value]`.
+
+Write the following code to the python file (e.g. `run.py`):
+
+.. code:: python
+
+    from recbole.config import Config
+
+    config = Config(model='BPR', dataset='ml-100k')
+    print('gpu_id: ', config['gpu_id'])
+    print('training_batch_size: ', config['training_batch_size'])
+
+Running:
+
+.. code:: bash
+
+    python run.py --gpu_id=3 --training_batch_size=256
+
+output:
+
+.. code:: bash
+
+    gpu_id: 3
+    training_batch_size: 256
+
+
+Priority
+^^^^^^^^^^^^^^^^^
+RecBole supports the combination of three types of parameter configurations.
+
+The priority of the configuration methods is: Command Line > Parameter Dicts
+> Config Files > Default Settings
+
+An example is as follows:
+
+`example.yaml`:
+
+.. code:: yaml
+
+    gpu_id: 1
+    training_batch_size: 1024
+
+`run.py`:
+
+.. code:: python
+
+    from recbole.config import Config
+
+    parameter_dict = {
+        'gpu_id': 2,
+        'training_batch_size': 512
+    }
+    config = Config(model='BPR', dataset='ml-100k', config_file_list=['example.yaml'], config_dict=parameter_dict)
+    print('gpu_id: ', config['gpu_id'])
+    print('training_batch_size: ', config['training_batch_size'])
+
+Running:
+
+.. code:: bash
+
+    python run.py --gpu_id=3 --training_batch_size=256
+
+output:
+
+.. code:: bash
+
+    gpu_id: 3
+    training_batch_size: 256
diff --git a/docs/source/user_guide/config/training_settings.rst b/docs/source/user_guide/config/training_settings.rst
new file mode 100644
index 000000000..f95f44e2b
--- /dev/null
+++ b/docs/source/user_guide/config/training_settings.rst
@@ -0,0 +1,24 @@
+Training Settings
+===========================
+Training settings are designed to set parameters about model training.
+
+
+- ``epochs (int)`` : The number of training epochs. Defaults to ``300``.
+- ``train_batch_size (int)`` : The training batch size. Defaults to ``2048``.
+- ``learner (str)`` : The name of the used optimizer. Defaults to ``'adam'``.
+  Range in ``['adam', 'sgd', 'adagrad', 'rmsprop', 'sparse_adam']``.
+- ``learning_rate (float)`` : Learning rate. Defaults to ``0.001``.
+- ``neg_sampling (dict)``: This parameter controls the negative sampling for model training.
+  The key range is ``['uniform', 'popularity']``, which decides the distribution of negative items in the sampling pool.
+  ``uniform`` means selecting negative items uniformly, while ``popularity`` means selecting negative items based on
+  their popularity (:obj:`Counter(item)` in the `.inter` file). Note that if your data is labeled, you need to set this parameter to ``None``.
+  The default value of this parameter is ``{'uniform': 1}``.
+- ``eval_step (int)`` : The number of training epochs before an evaluation
+  on the valid dataset. If it is less than 1, the model will not be
+  evaluated on the valid dataset. Defaults to ``1``.
+- ``stopping_step (int)`` : The threshold for validation-based early stopping.
+  Defaults to ``10``.
+- ``clip_grad_norm (dict)`` : The args of `clip_grad_norm_ `_
+  which will clip the gradient norm of the model. Defaults to ``None``.
+- ``loss_decimal_place (int)``: The decimal place of training loss. Defaults to ``4``.
+- ``weight_decay (float)`` : Weight decay (L2 penalty), used for `optimizer `_. Defaults to ``0.0``.
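+
+As a quick illustration, the training block above can also be passed programmatically as a parameter dict (a sketch restating the documented defaults; any entry can be changed):
+
+.. code:: python
+
+    from recbole.quick_start import run_recbole
+
+    # the documented training defaults, expressed as a config dict
+    parameter_dict = {
+        'epochs': 300,
+        'train_batch_size': 2048,
+        'learner': 'adam',
+        'learning_rate': 0.001,
+        'neg_sampling': {'uniform': 1},
+        'eval_step': 1,
+        'stopping_step': 10,
+    }
+    run_recbole(model='BPR', dataset='ml-100k', config_dict=parameter_dict)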
\ No newline at end of file diff --git a/docs/source/user_guide/config_settings.rst b/docs/source/user_guide/config_settings.rst index 6d268334c..bb317f161 100644 --- a/docs/source/user_guide/config_settings.rst +++ b/docs/source/user_guide/config_settings.rst @@ -1,259 +1,32 @@ -Config Settings +Config Introduction =================== RecBole is able to config different parameters for controlling the experiment setup (e.g., data processing, data splitting, training and evaluation). The users can select the settings according to their own requirements. -The introduction of different parameter configurations are presented as follows: - -Parameters Introduction +Config settings ----------------------------- -The parameters in RecBole can be divided into three categories: -Basic Parameters, Dataset Parameters and Model Parameters. - -Basic Parameters -^^^^^^^^^^^^^^^^^^^^^^ -Basic parameters are used to build the general environment including the settings for -model training and evaluation. - -**Environment Setting** - -- ``gpu_id (int or str)`` : The id of GPU device. Defaults to ``0``. -- ``use_gpu (bool)`` : Whether or not to use GPU. If True, using GPU, else using CPU. - Defaults to ``True``. -- ``seed (int)`` : Random seed. Defaults to ``2020``. -- ``state (str)`` : Logging level. Defaults to ``'INFO'``. - Range in ``['INFO', 'DEBUG', 'WARNING', 'ERROR', 'CRITICAL']``. -- ``reproducibility (bool)`` : If True, the tool will use deterministic - convolution algorithms, which makes the result reproducible. If False, - the tool will benchmark multiple convolution algorithms and select the fastest one, - which makes the result not reproducible but can speed up model training in - some case. Defaults to ``True``. -- ``data_path (str)`` : The path of input dataset. Defaults to ``'dataset/'``. -- ``checkpoint_dir (str)`` : The path to save checkpoint file. - Defaults to ``'saved/'``. -- ``show_progress (bool)`` : Show the progress of training epoch and evaluate epoch. - Defaults to ``True``. -- ``save_dataset (bool)``: Whether or not save filtered dataset. - If True, save filtered dataset, otherwise it will not be saved. - Defaults to ``False``. -- ``save_dataloaders (bool)``: Whether or not save split dataloaders. - If True, save split dataloaders, otherwise they will not be saved. - Defaults to ``False``. - -**Training Setting** - -- ``epochs (int)`` : The number of training epochs. Defaults to ``300``. -- ``train_batch_size (int)`` : The training batch size. Defaults to ``2048``. -- ``learner (str)`` : The name of used optimizer. Defaults to ``'adam'``. - Range in ``['adam', 'sgd', 'adagrad', 'rmsprop', 'sparse_adam']``. -- ``learning_rate (float)`` : Learning rate. Defaults to ``0.001``. -- ``training_neg_sample_num (int)`` : The number of negative samples during - training. If it is set to 0, the negative sampling operation will not be - performed. Defaults to ``1``. -- ``training_neg_sample_distribution(str)`` : Distribution of the negative items - in training phase. Default to ``uniform``. Range in ``['uniform', 'popularity']``. -- ``eval_step (int)`` : The number of training epochs before a evaluation - on the valid dataset. If it is less than 1, the model will not be - evaluated on the valid dataset. Defaults to ``1``. -- ``stopping_step (int)`` : The threshold for validation-based early stopping. - Defaults to ``10``. -- ``clip_grad_norm (dict)`` : The args of `clip_grad_norm_ `_ - which will clips gradient norm of model. Defaults to ``None``. 
-- ``loss_decimal_place(int)``: The decimal place of training loss. Defaults to ``4``. -- ``weight_decay (float)`` : Weight decay (L2 penalty), used for `optimizer `_. Default to ``0.0``. - - - -**Evaluation Setting** - -- ``eval_setting (str)``: The evaluation settings. Defaults to ``'RO_RS,full'``. - The parameter has two parts. The first part control the splitting methods, - the range is ``['RO_RS','TO_LS','RO_LS','TO_RS']``. The second part(optional) - control the ranking mechanism, the range is ``['full','uni100','uni1000','pop100','pop1000']``. -- ``group_by_user (bool)``: Whether or not to group the users. - It must be ``True`` when ``eval_setting`` is in ``['RO_LS', 'TO_LS']``. - Defaults to ``True``. -- ``split_ratio (list)``: The split ratio between train data, valid data and - test data. It only take effects when the first part of ``eval_setting`` - is in ``['RO_RS', 'TO_RS']``. Defaults to ``[0.8, 0.1, 0.1]``. -- ``leave_one_num (int)``: It only take effects when the first part of - ``eval_setting`` is in ``['RO_LS', 'TO_LS']``. Defaults to ``2``. - -- ``metrics (list or str)``: Evaluation metrics. Defaults to - ``['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']``. Range in - ``['Recall', 'MRR', 'NDCG', 'Hit', 'MAP', 'Precision', 'AUC', 'GAUC', - 'MAE', 'RMSE', 'LogLoss']``. -- ``topk (list or int or None)``: The value of k for topk evaluation metrics. - Defaults to ``10``. -- ``valid_metric (str)``: The evaluation metrics for early stopping. - It must be one of used ``metrics``. Defaults to ``'MRR@10'``. -- ``eval_batch_size (int)``: The evaluation batch size. Defaults to ``4096``. -- ``metric_decimal_place(int)``: The decimal place of metric score. Defaults to ``4``. - -Pleaser refer to :doc:`evaluation_support` for more details about the parameters -in Evaluation Setting. - -Dataset Parameters -^^^^^^^^^^^^^^^^^^^^^^^ -Dataset Parameters are used to describe the dataset information and control -the dataset loading and filtering. - -Please refer to :doc:`data/data_args` for more details. - -Model Parameters -^^^^^^^^^^^^^^^^^^^^^ -Model Parameters are used to describe the model structures. - -Please refer to :doc:`model_intro` for more details. - - -Parameters Configuration ------------------------------- -RecBole supports three types of parameter configurations: Config files, -Parameter Dicts and Command Line. The parameters are assigned via the -Configuration module. - -Config Files -^^^^^^^^^^^^^^^^ -Config Files should be organized in the format of yaml. -The users should write their parameters according to the rules aligned with -yaml, and the final config files are processed by the configuration module -to complete the parameter settings. - -To begin with, we write the parameters into the yaml files (e.g. `example.yaml`). - -.. code:: yaml - - gpu_id: 1 - training_batch_size: 1024 - -Then, the yaml files are conveyed to the configuration module to finish the -parameter settings. - -.. code:: python - - from recbole.config import Config - - config = Config(model='BPR', dataset='ml-100k', config_file_list=['example.yaml']) - print('gpu_id: ', config['gpu_id']) - print('training_batch_size: ', config['training_batch_size']) - - -output: - -.. code:: bash - - gpu_id: 1 - training_batch_size: 1024 - -The parameter ``config_file_list`` supports multiple yaml files. - -For more details on yaml, please refer to YAML_. +We split all the config settings into five parts: environment settings, data settings, model settings, training settings and evaluation settings. 
+The introduction of different parameter configurations is presented as follows (for model settings, please read the specific model page in :doc:`model_intro`):
 
-.. _YAML: https://yaml.org/
+.. toctree::
+   :maxdepth: 1
 
-When using our toolkit, the parameters belonging to **Dataset parameters** and
-Evaluation Settings of **Basic Parameters** are recommended to be written into
-the config files, which may be convenient for reusing the configurations.
+   config/environment_settings
+   config/data_settings
+   config/training_settings
+   config/evaluation_settings
 
-Parameter Dicts
-^^^^^^^^^^^^^^^^^^
-Parameter Dict is realized by the dict data structure in python, where the key
-is the parameter name, and the value is the parameter value. The users can write their
-parameters into a dict, and input it into the configuration module.
-
-An example is as follows:
-
-.. code:: python
-
-    from recbole.config import Config
-
-    parameter_dict = {
-        'gpu_id': 2,
-        'training_batch_size': 512
-    }
-    config = Config(model='BPR', dataset='ml-100k', config_dict=parameter_dict)
-    print('gpu_id: ', config['gpu_id'])
-    print('training_batch_size: ', config['training_batch_size'])
-
-output:
-
-.. code:: bash
-
-    gpu_id: 2
-    training_batch_size: 512
-
-
-Command Line
-^^^^^^^^^^^^^^^^^^^^^^^^
-We can also assign parameters based on the command line.
-The parameters in the command line can be read from the configuration module.
-The format is: `--parameter_name=[parameter_value]`.
-
-Write the following code to the python file (e.g. `run.py`):
-
-.. code:: python
-
-    from recbole.config import Config
-
-    config = Config(model='BPR', dataset='ml-100k')
-    print('gpu_id: ', config['gpu_id'])
-    print('training_batch_size: ', config['training_batch_size'])
-
-Running:
-
-.. code:: bash
-
-    python run.py --gpu_id=3 --training_batch_size=256
-
-output:
-
-.. code:: bash
-
-    gpu_id: 3
-    training_batch_size: 256
-
-
-Priority
-^^^^^^^^^^^^^^^^^
-RecBole supports the combination of three types of parameter configurations.
-
-The priority of the configuration methods is: Command Line > Parameter Dicts
-> Config Files > Default Settings
-
-A example is as follows:
-
-`example.yaml`:
-
-.. code:: yaml
-
-    gpu_id: 1
-    training_batch_size: 1024
-
-`run.py`:
-
-.. code:: python
-
-    from recbole.config import Config
-
-    parameter_dict = {
-        'gpu_id': 2,
-        'training_batch_size': 512
-    }
-    config = Config(model='BPR', dataset='ml-100k', config_file_list=['example.yaml'], config_dict=parameter_dict)
-    print('gpu_id: ', config['gpu_id'])
-    print('training_batch_size: ', config['training_batch_size'])
-
-Running:
+How to set config?
+-----------------------------
+RecBole supports three types of parameter configurations: Config files, Parameter Dicts and Command Line.
+The parameters are assigned via the Configuration module.
 
-.. code:: bash
+For more details about setting the config, please read
 
-    python run.py --gpu_id=3 --training_batch_size=256
+.. toctree::
+   :maxdepth: 1
 
-output:
+   config/parameters_configuration
 
-.. code:: bash
 
-    gpu_id: 3
-    training_batch_size: 256
diff --git a/docs/source/user_guide/data/dataset_download.rst b/docs/source/user_guide/data/dataset_download.rst
new file mode 100644
index 000000000..01672dbdd
--- /dev/null
+++ b/docs/source/user_guide/data/dataset_download.rst
@@ -0,0 +1,39 @@
+Dataset Download
+================================
+
+In RecBole, we have collected and released 28 commonly used public datasets (detailed in the `Dataset List `_).
+Users can freely download these datasets in the following three ways:
+
+1. Automatic downloading
+-----------------------------
+For the convenience of users, we implement an automatic downloading module in RecBole, and we now support downloading the :doc:`atomic_files` of 28 commonly used
+public datasets (detailed in the `Dataset List `_). If you want to run models on a dataset, you just need to set the
+`dataset` and the data files will be downloaded automatically.
+
+For example, if you want to run the BPR model on the ml-1m dataset but you haven't prepared the :doc:`atomic_files` of the ml-1m dataset,
+you can use our automatic downloading module to download the data.
+All you need to do is run the model as usual; RecBole will automatically check whether you have the data files, and if not, it will begin to download them
+and you will get output like this:
+
+.. code:: none
+
+    23 Aug 10:02 INFO Prepare to download dataset [ml-1m] from [https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/MovieLens/ml-1m.zip].
+    23 Aug 10:02 INFO Downloading https://recbole.s3-accelerate.amazonaws.com/ProcessedDatasets/MovieLens/ml-1m.zip
+    Downloaded 0.01 GB: 100%|█████████████████████████████████████████████████████████████████████████████| 7/7 [04:16<00:00, 36.65s/it]
+    23 Aug 10:06 INFO Extracting dataset/ml-1m/ml-1m.zip
+    23 Aug 10:06 INFO Downloading done.
+
+And next time you can directly run other models on the ml-1m dataset.
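+
+In code, triggering the automatic download is nothing more than referencing the dataset name (a sketch):
+
+.. code:: python
+
+    from recbole.quick_start import run_recbole
+
+    # ml-1m is downloaded and extracted automatically if its files are missing
+    run_recbole(model='BPR', dataset='ml-1m')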
+
+2. Download from cloud disk
+-----------------------------
+Besides automatic downloading, we have also uploaded our collected and converted atomic files of the 28 datasets to `Google Drive `_ and `Baidu Wangpan `_ (Password: e272).
+You can also download the data from these two sources by yourself.
+
+3. Convert the raw data
+-----------------------------
+If you have already downloaded the raw data, you can also convert it into the atomic files format by yourself.
+We have already published some converting scripts in `RecDatasets `_.
+
+
+
diff --git a/docs/source/user_guide/data/label_of_data.rst b/docs/source/user_guide/data/label_of_data.rst
new file mode 100644
index 000000000..c33b2795f
--- /dev/null
+++ b/docs/source/user_guide/data/label_of_data.rst
@@ -0,0 +1,90 @@
+Label of data
+=========================
+In the recommendation field, there are two kinds of data scenes: the explicit feedback scene and the implicit feedback scene.
+
+Explicit feedback, like ratings for items, has explicit labels for model training, while for implicit feedback, like clicks and purchases,
+the label of the data is vague; generally we regard all the observed interactions as positive samples and select negative samples from the
+unobserved interactions (known as negative sampling).
+
+To support both the explicit feedback scene and the implicit feedback scene, RecBole designs three ways to set the label of data.
+
+1. Set label field
+-----------------------------
+If your data has already been labeled, you only need to set ``LABEL_FIELD`` to tell the model
+which column represents the label of the data, and then set `neg_sampling` as `None`.
+
+For example, if your `.inter` file is like:
+
+============= ============= ============ ===============
+user_id:token item_id:token label:float  timestamp:float
+============= ============= ============ ===============
+1             1193          1            978300760
+1             661           0            978302109
+2             11            1            978302009
+2             112           1            978312344
+2             555           0            978302321
+3             234           1            978302109
+============= ============= ============ ===============
+
+Then, you can set the config like:
+
+.. code:: yaml
+
+    LABEL_FIELD: label
+    neg_sampling: None
+
+Note that the value of your label column should only be 0 or 1 (0 represents the negative label and
+1 represents the positive label).
+
+2. Set threshold
+------------------------------
+
+If your data doesn't have labels but has user feedback information (like ratings for items) showing their preferences,
+a general way to label the data is to set a threshold.
+
+For example, if your `.inter` file is like:
+
+============= ============= ============ ===============
+user_id:token item_id:token rating:float timestamp:float
+============= ============= ============ ===============
+1             1193          5            978300760
+1             661           1            978302109
+2             11            4            978302009
+2             112           4            978312344
+2             555           1            978302321
+3             234           3            978302109
+============= ============= ============ ===============
+
+To set labels for these interactions, you can set ``3`` as the threshold of the rating, and
+the interactions will be labeled as positive if their rating is no less than 3.
+
+You can set the config like:
+
+.. code:: yaml
+
+    threshold:
+        rating: 3
+    neg_sampling: None
+
+And then RecBole will automatically set the label for each interaction based on its rating column.
+
+3. Negative sampling
+------------------------------
+If you only have implicit feedback data, without labels or user feedback information,
+a general way to label this kind of data is negative sampling. We will assume that for each user, all the observed interactions are positive
+and the unobserved ones are negative. We will then set a positive label for all the observed interactions,
+and select some negative samples from the unobserved interactions according to a certain strategy.
+
+You can set the config like:
+
+.. code:: yaml
+
+    neg_sampling:
+        uniform: 1
+
+And then RecBole will automatically select one negative sample for each positive sample, uniformly from the unobserved interactions.
+
+Finally, for more details about the label config, please read :doc:`../config/data_settings` and :doc:`../config/training_settings`.
+
+
+
diff --git a/docs/source/user_guide/data_intro.rst b/docs/source/user_guide/data_intro.rst
index f84d2c04e..84b693d68 100644
--- a/docs/source/user_guide/data_intro.rst
+++ b/docs/source/user_guide/data_intro.rst
@@ -1,12 +1,31 @@
-Data Introduction
+Data Module Introduction
-===================
+========================
 
-Here we introduce the whole dataflow and highlight its key features.
+RecBole not only implements lots of popular recommender models, but also collects and releases 28 commonly used public datasets.
+You can freely download these datasets following our docs :doc:`data/dataset_download`.
+
+For extensibility and reusability, RecBole has a flexible and extensible data module.
+Our data module designs an elegant data flow that transforms raw data
+into the model input, detailed in :doc:`data/data_flow`.
+In order to characterize most forms of the input data
+required by different recommendation tasks, RecBole designs an input data format called :doc:`data/atomic_files`. All the input data should be
+converted into the `Atomic Files` format.
+Besides, we design a data structure called :doc:`data/interaction` to provide a unified internal data representation for different
+recommendation algorithms.
+
+Plus, RecBole supports both explicit feedback (labeled data) scenes and implicit feedback (unlabeled data) scenes. For explicit feedback scenes,
+users can set the `LABEL_FIELD` in the config and RecBole will train and test the model based on the label. For implicit feedback scenes, RecBole will
+
+3. Negative sampling
+------------------------------
+If you only have implicit feedback data, without labels or users' feedback information,
+a general way to label this kind of data is negative sampling. We assume that for each user, all the observed interactions are positive
+and the unobserved ones are negative. We then set a positive label for all the observed interactions,
+and select some negative samples from the unobserved interactions according to a certain strategy.
+
+You can set the config like:
+
+.. code:: yaml
+
+    neg_sampling:
+        uniform: 1
+
+And then, RecBole will automatically select one negative sample for each positive sample uniformly from the unobserved interactions.
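+
+Conceptually, uniform negative sampling behaves like the following sketch (illustrative only, assuming item ids run from 1 to ``n_items``; RecBole's actual samplers live in the ``recbole.sampler`` module):
+
+.. code:: python
+
+    import numpy as np
+
+    def uniform_negative_sample(n_items, observed_items, num_neg):
+        # draw item ids uniformly at random, rejecting the user's observed items
+        negatives = []
+        while len(negatives) < num_neg:
+            candidate = np.random.randint(1, n_items + 1)  # item ids 1 .. n_items
+            if candidate not in observed_items:
+                negatives.append(candidate)
+        return negatives
+
+    print(uniform_negative_sample(n_items=10, observed_items={1, 3}, num_neg=2))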
+
+At last, for more details about the label config, please read :doc:`../config/data_settings` and :doc:`../config/training_settings`.
+
+
diff --git a/docs/source/user_guide/data_intro.rst b/docs/source/user_guide/data_intro.rst
index f84d2c04e..84b693d68 100644
--- a/docs/source/user_guide/data_intro.rst
+++ b/docs/source/user_guide/data_intro.rst
@@ -1,12 +1,31 @@
-Data Introduction
-===================
-Here we introduce the whole dataflow and highlight its key features.
+Data Module Introduction
+=========================
+RecBole not only implements lots of popular recommender models, but also collects and releases 28 commonly-used published datasets.
+You can freely download these datasets following our docs :doc:`data/dataset_download`.
+
+For extensibility and reusability, RecBole has a flexible and extensible data module.
+Our data module designs an elegant data flow that transforms raw data
+into the model input, detailed in :doc:`data/data_flow`.
+In order to characterize most forms of the input data
+required by different recommendation tasks, RecBole designs an input data format called :doc:`data/atomic_files`. All the input data should be
+converted into the `Atomic Files` format.
+Besides, we design a data structure called :doc:`data/interaction` to provide a unified internal data representation for different
+recommendation algorithms.
+
+In addition, RecBole supports both explicit feedback (labeled data) scenes and implicit feedback (unlabeled data) scenes. For explicit feedback scenes,
+users can set the `LABEL_FIELD` in the config and RecBole will train and test the model based on the label. For implicit feedback scenes, RecBole will
+regard all the observed interactions as positive samples and automatically select negative samples from the unobserved interactions (which is known as negative sampling).
+For more information about label setting in RecBole,
+please read :doc:`data/label_of_data`.
+
+Here are the related docs for the data module:
 
 .. toctree::
    :maxdepth: 1
 
+   data/dataset_download
    data/data_flow
    data/atomic_files
    data/interaction
-   data/data_args
+   data/label_of_data
\ No newline at end of file
diff --git a/docs/source/user_guide/evaluation_support.rst b/docs/source/user_guide/evaluation_support.rst
deleted file mode 100644
index 39cc2167c..000000000
--- a/docs/source/user_guide/evaluation_support.rst
+++ /dev/null
@@ -1,65 +0,0 @@
-Evaluation Support
-===========================
-
-The function of evaluation module is to implement commonly used evaluation
-protocols for recommender systems. Since different models can be compared under
-the same evaluation modules, RecBole standardizes the evaluation of recommender
-systems.
-
-
-Evaluation Settings
------------------------
-The evaluation settings supported by RecBole is as following. Among them, the
-first four rows correspond to the dataset splitting methods, while the last two
-rows correspond to the ranking mechanism, namely a full ranking over all the
-items or a sampled-based ranking.
-
-================== ========================================================
- Notation           Explanation
-================== ========================================================
- RO_RS              Random Ordering + Ratio-based Splitting
- TO_LS              Temporal Ordering + Leave-one-out Splitting
- RO_LS              Random Ordering + Leave-one-out Splitting
- TO_RS              Temporal Ordering + Ratio-based Splitting
- full               full ranking with all item candidates
- uniN               sample-based ranking: each positive item is paired with N sampled negative items in uniform distribution
- popN               sample-based ranking: each positive item is paired with N sampled negative items in popularity distribution
-================== ========================================================
-
-The parameters used to control the evaluation settings are as follows:
-
-- ``eval_setting (str)``: The evaluation settings. Defaults to ``'RO_RS,full'``.
-  The parameter has two parts. The first part control the splitting methods,
-  range in ``['RO_RS','TO_LS','RO_LS','TO_RS']``. The second part(optional)
-  control the ranking mechanism, range in ``['full','uni100','uni1000','pop100','pop1000']``.
-- ``group_by_user (bool)``: Whether the users are grouped.
-  It must be ``True`` when ``eval_setting`` is in ``['RO_LS', 'TO_LS']``.
-  Defaults to ``True``.
-- ``spilt_ratio (list)``: The split ratio between train data, valid data and
-  test data. It only take effects when the first part of ``eval_setting``
-  is in ``['RO_RS', 'TO_RS']``. Defaults to ``[0.8, 0.1, 0.1]``.
-- ``leave_one_num (int)``: It only take effects when the first part of
-  ``eval_setting`` is in ``['RO_LS', 'TO_LS']``. Defaults to ``2``.
-
-Evaluation Metrics
------------------------
-
-RecBole supports both value-based and ranking-based evaluation metrics.
-
-The value-based metrics (i.e., for rating prediction) include ``RMSE``, ``MAE``,
-``AUC`` and ``LogLoss``, measuring the prediction difference between the true
-and predicted values.
-
-The ranking-based metrics (i.e., for top-k item recommendation) include the most
-common ranking-aware metrics, such as ``Recall``, ``Precision``, ``Hit``,
-``NDCG``, ``MAP`` and ``MRR``, measuring the ranking performance of the
-generated recommendation lists by an algorithm.
-
-The parameters used to control the evaluation metrics are as follows:
-
-- ``metrics (list or str)``: Evaluation metrics. Defaults to
-  ``['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']``. Range in
-  ``['Recall', 'MRR', 'NDCG', 'Hit', 'MAP', 'Precision', 'AUC',
-  'MAE', 'RMSE', 'LogLoss']``.
-- ``topk (list or int or None)``: The value of k for topk evaluation metrics.
-  Defaults to ``10``.
diff --git a/docs/source/user_guide/model_intro.rst b/docs/source/user_guide/model_intro.rst
index 45cc8550d..dfe3ca5f9 100644
--- a/docs/source/user_guide/model_intro.rst
+++ b/docs/source/user_guide/model_intro.rst
@@ -107,7 +107,6 @@ Knowledge-based Recommendation
 ---------------------------------
 Knowledge-based recommendation introduce an external knowledge graph to enhance
 general or sequential recommendation.
-
 .. toctree::
    :maxdepth: 1
diff --git a/docs/source/user_guide/train_eval_intro.rst b/docs/source/user_guide/train_eval_intro.rst
new file mode 100644
index 000000000..caf7aff3a
--- /dev/null
+++ b/docs/source/user_guide/train_eval_intro.rst
@@ -0,0 +1,97 @@
+Training & Evaluation Introduction
+====================================
+
+Training introduction
+-----------------------
+Multiple training strategies are supported by RecBole. For traditional CPU-based
+collaborative filtering models, non-gradient training is naturally applied. For
+mainstream neural-based models, automatic gradient descent is well equipped
+and set as the default training strategy. A two-stage training strategy is also prepared
+for pretraining-based models. In addition, users who need an unusual training strategy
+can customize the ``Trainer``; please refer to :doc:`../developer_guide/customize_trainers`
+for more details.
+
+Apart from flexible training strategies, automatic hyper-parameter searching is
+also supported. The implementation of the searching is fully based on `hyperopt `_.
+Users can set the range of hyper-parameters in a config file in the hyperopt format,
+and the optimal hyper-parameters and results will be output.
+You can read :doc:`usage/parameter_tuning` for more information about hyper-parameter tuning in RecBole.
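+
+For illustration, such a range file might look like the following sketch (the parameter names and ranges are just examples; see :doc:`usage/parameter_tuning` for the exact file format):
+
+.. code:: none
+
+    learning_rate loguniform -8,0
+    embedding_size choice [64,96,128]
+    train_batch_size choice [512,1024,2048]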
+
+To control the training method, we design a series of training parameters in the config,
+and you can read :doc:`config/training_settings` for more information.
+
+
+Evaluation introduction
+-----------------------
+The function of the evaluation module is to implement commonly used evaluation
+protocols for recommender systems. Since different models can be compared under
+the same evaluation modules, RecBole standardizes the evaluation of recommender
+systems.
+
+Evaluation method
+>>>>>>>>>>>>>>>>>>>>>>>
+
+The evaluation methods supported by RecBole are as follows. Among them, the
+first four rows correspond to the dataset ordering and splitting methods, while the last three
+rows correspond to the ranking mechanism, namely a full ranking over all the
+items or a sample-based ranking.
+
+================== ========================================================
+ Notation           Explanation
+================== ========================================================
+ RO                 Random Ordering
+ TO                 Temporal Ordering
+ LS                 Leave-one-out Splitting
+ RS                 Ratio-based Splitting
+ full               full ranking with all item candidates
+ uniN               sample-based ranking: each positive item is paired with N sampled negative items in uniform distribution
+ popN               sample-based ranking: each positive item is paired with N sampled negative items in popularity distribution
+================== ========================================================
+
+The parameters used to control the evaluation method are as follows:
+
+- ``eval_args (dict)``: The overall evaluation settings. It contains all the settings of evaluation,
+  including ``split``, ``group_by``, ``order`` and ``mode``.
+
+  - ``split (dict)``: Controls the splitting of the dataset and the split ratio. The key is the splitting method
+    and the value is the list of split ratios. The range of the key is ``[RS, LS]``. Defaults to ``{'RS': [0.8, 0.1, 0.1]}``.
+  - ``group_by (str)``: Whether to group the dataset by user before splitting.
+    Range in ``[None, user]`` and defaults to ``user``.
+  - ``order (str)``: Controls the ordering of the data, which affects the splitting of the data.
+    Range in ``['RO', 'TO']`` and defaults to ``RO``.
+  - ``mode (str)``: Controls the candidates used for ranking.
+    Range in ``[labeled, full, unixxx, popxxx]`` (e.g., ``uni100``) and defaults to ``full``.
+
+- ``repeatable (bool)``: Whether to evaluate the result with a repeatable recommendation scene.
+  Note that it is disabled for sequential models as the recommendation is already repeatable.
+  For other models, defaults to ``False``.
+
+Evaluation metrics
+>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+RecBole supports both value-based and ranking-based evaluation metrics.
+
+The value-based metrics (i.e., for rating prediction) include ``RMSE``, ``MAE``,
+``AUC`` and ``LogLoss``, measuring the prediction difference between the true
+and predicted values.
+
+The ranking-based metrics (i.e., for top-k item recommendation) include the most
+common ranking-aware metrics, such as ``Recall``, ``Precision``, ``Hit``,
+``NDCG``, ``MAP``, ``MRR`` and ``GAUC``, measuring the ranking performance of the
+recommendation lists generated by an algorithm. Besides, several ranking-based
+non-accuracy metrics are supported for evaluation from different views, such as
+``ItemCoverage``, ``AveragePopularity``, ``GiniIndex``, ``ShannonEntropy`` and ``TailPercentage``.
+More details about the metrics can be found in :doc:`/recbole/recbole.evaluator.metrics`.
+
+The parameters used to control the evaluation metrics are as follows (see the example after this list):
+
+- ``metrics (list or str)``: Evaluation metrics. Defaults to
+  ``['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']``. Range in
+  ``['Recall', 'MRR', 'NDCG', 'Hit', 'MAP', 'Precision', 'AUC',
+  'MAE', 'RMSE', 'LogLoss', 'ItemCoverage', 'AveragePopularity',
+  'GiniIndex', 'ShannonEntropy', 'TailPercentage']``.
+  Note that value-based metrics and ranking-based metrics cannot be used together.
+- ``topk (list or int or None)``: The value of k for top-k evaluation metrics.
+  Defaults to ``10``.
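+
+For example, a typical ranking-based setting is the sketch below (the values are just the defaults listed above):
+
+.. code:: yaml
+
+    metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
+    topk: 10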
+
+For more details about evaluation settings, please read :doc:`config/evaluation_settings`.
\ No newline at end of file
diff --git a/docs/source/user_guide/usage.rst b/docs/source/user_guide/usage.rst
index 4c0a0db1e..3aca00a84 100644
--- a/docs/source/user_guide/usage.rst
+++ b/docs/source/user_guide/usage.rst
@@ -1,6 +1,7 @@
 Usage
 ===================
-Here we introduce how to use RecBole.
+In order to help users learn the in-depth usage of RecBole, we provide the following usage docs
+to give a detailed introduction to RecBole's features.
 
 .. toctree::
    :maxdepth: 1
@@ -10,7 +11,8 @@ Here we introduce how to use RecBole.
    usage/parameter_tuning
    usage/running_new_dataset
    usage/running_different_models
-   usage/qa
    usage/load_pretrained_embedding
    usage/save_and_load_data_and_model
-   usage/case_study
\ No newline at end of file
+   usage/case_study
+   usage/use_tensorboard
+   usage/qa
\ No newline at end of file
diff --git a/docs/source/user_guide/usage/qa.rst b/docs/source/user_guide/usage/qa.rst
index f088c6802..451a9c753 100644
--- a/docs/source/user_guide/usage/qa.rst
+++ b/docs/source/user_guide/usage/qa.rst
@@ -22,15 +22,3 @@ For more rigorous evaluation, those user-item interaction records in validation
 Thus the distribution of validation & test sets may be inconsistent.
 
 However, this doesn't affect the comparison between models.
-
-**Q3**
-
-Why do I receive a warning about ``batch_size changed``? What is the meaning of :attr:`batch_size` in dataloader?
-
-**A3**
-
-In RecBole's dataloader, the meaning of :attr:`batch_size` is the upper bound of the number of **interactions** in one single batch.
-
-On the one hand, it's easy to calculate and control the usage of GPU memories. E.g., while comparing between different datasets, you don't need to change the value of :attr:`batch_size`, because the usage of GPU memories will not change a lot.
-
-On the other hand, in RecBole's top-k evaluation, we need the interactions of each user grouped in one batch. In other words, the interactions of any user should not be separated into multiple batches. We try to feed more interactions into one batch, but due to the above rules, the :attr:`batch_size` is just an upper bound. And :meth:`_batch_size_adaptation` is designed to adapt the actual batch size dynamically. Thus, while executing :meth:`_batch_size_adaptation`, you will receive a warning message.
diff --git a/docs/source/user_guide/usage/running_different_models.rst b/docs/source/user_guide/usage/running_different_models.rst
index e262a940d..48081dcc3 100644
--- a/docs/source/user_guide/usage/running_different_models.rst
+++ b/docs/source/user_guide/usage/running_different_models.rst
@@ -1,19 +1,14 @@
 Running Different Models
 ==========================
-Here, we present how to run different models in RecBole.
+In RecBole, we have 4 categories of models, namely general recommendation, context-aware
+recommendation, sequential recommendation and knowledge-based recommendation. Since different categories of models have different requirements for data
+processing and evaluation settings, we need to configure these settings appropriately.
-
-Proper Parameters Configuration
-----------------------------------
-Since different categories of models have different requirements for data
-processing and evaluation setting, we need to configure these settings
-appropriately.
+Here, we present some examples to show how to run these four categories of models in RecBole.
-
-The following will introduce the parameter configuration of these four
-categories of models: namely general recommendation, context-aware
-recommendation, sequential recommendation and knowledge-based recommendation.
 
 General Recommendation
-^^^^^^^^^^^^^^^^^^^^^^^^^^
+---------------------------------
 
 **specify and load the user and item columns**
 
@@ -35,19 +30,21 @@ corresponding column names.
 
 **training and evaluation settings**
 
 General recommendation models usually needs to group data by user and perform
-negative sampling.
+negative sampling. You can set the config like this:
 
 .. code:: yaml
 
-   group_by_user: True
-   training_neg_sample_num: 1
+   eval_args:
+      group_by: user
+   neg_sampling:
+      uniform: 1
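+
+With such a config saved to a file, a minimal run could look like the following sketch (``general.yaml`` is a hypothetical file name; BPR is just an example general recommendation model):
+
+.. code:: python
+
+   from recbole.quick_start import run_recbole
+
+   # the yaml file above supplies the grouping and negative sampling settings
+   run_recbole(model='BPR', dataset='ml-100k', config_file_list=['general.yaml'])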
+
+
 Context-aware Recommendation
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+------------------------------------
 
 **load the feature columns**
 
-Context-aware recommendation models utilize the features of users, items and
+Generally, context-aware recommendation models utilize the features of users, items and
 interactions to make CTR predictions, so it needs to load the used features.
 
 .. code:: yaml
@@ -62,53 +59,25 @@
 inter atomic file.
 
 **label setting**
 
-We also need to configure `LABEL_FIELD`, which represents the label column in
-the CTR prediction. For the Context-aware recommendation models, the setting of
-`LABEL_FIELD` is divided into two cases:
+In general, context-aware recommendation models are mainly used in explicit feedback scenes,
+so your data should have explicit feedback information and you need to set labels for it. For more information about label setting,
+please read :doc:`../data/label_of_data`.
 
-1) There is a label field in atomic file, and the value is in 0/1, we only need to
-set as follows:
+**evaluation settings**
 
-.. code:: yaml
-
-   LABEL_FIELD: label
-
-2) There is no label field in atomic file, we need to generate label field based
-on some information.
-
-.. code:: yaml
-
-   LABEL_FIELD: label
-   threshold:
-      rating: 3
-
-`rating` is a column in atomic file and is loaded (by ``load_col``). In this way,
-the label of the interaction with ``rating >= 3`` is set to 1, the reset are
-set to 0.
-
-**training and evaluation settings**
-
-Context-aware recommendation models usually does not need to group data by user and
-perform negative sampling.
-
-.. code:: yaml
-
-   group_by_user: False
-   training_neg_sample_num: 0
-
-Since there is no need to rank the results, ``eval_setting`` only needs to set
-the first part, for example:
-
-.. code:: yaml
-
-   eval_setting: RO_RS
-
-The evaluation metrics are generally set to `AUC` and `LogLoss`.
+If you want to apply context-aware recommendation models to CTR prediction, you can set the config like:
 
 .. code:: yaml
 
+   eval_args:
+      group_by: None
+      mode: labeled
    metrics: ['AUC', 'LogLoss']
+   valid_metric: AUC
 
+Note that RecBole also supports evaluating context-aware recommendation models by full ranking, like general recommendation models,
+but you need to make sure that your ``.inter`` file does not load any other context information columns.
+
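+For example, a possible full-ranking setting for a context-aware model is the sketch below (it assumes the ``.inter`` file only loads the ``user_id`` and ``item_id`` columns):
+
+.. code:: yaml
+
+   load_col:
+      inter: [user_id, item_id]
+   eval_args:
+      mode: full
+   metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision']
+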
 Sequential Recommendation
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/source/user_guide/usage/running_new_dataset.rst b/docs/source/user_guide/usage/running_new_dataset.rst
index b986567f5..ef1794a82 100644
--- a/docs/source/user_guide/usage/running_new_dataset.rst
+++ b/docs/source/user_guide/usage/running_new_dataset.rst
@@ -1,21 +1,23 @@
 Running New Dataset
 =======================
-Here, we present how to use a new dataset in RecBole.
+RecBole has a built-in dataset **ml-100k** for users to get started quickly.
+However, if you want to use a new dataset, here we present how to do so in RecBole.
 
-Convert to Atomic Files
+Prepare atomic files
 -------------------------
-If the user use the collected datasets, she can choose one of the following ways:
+In order to characterize most forms of the input data required by different recommendation tasks,
+RecBole designs an input data format called :doc:`../data/atomic_files` and
+you need to convert your raw data into the Atomic Files format before data loading.
 
-1. Download the converted atomic files from `Google Drive `_ or `Baidu Wangpan `_ (Password: e272).
-2. Find the converting script from RecDatasets_, and transform them to atomic files.
+For the convenience of users, we have collected more than
+28 commonly used datasets (detailed in the `Dataset List `_) and released their Atomic Files format
+for users to download freely. More information about downloading our prepared datasets can be found in :doc:`../data/dataset_download`.
 
-If the user use other datasets, she should format the data according to the format of the atomic files.
+However, if you use other datasets, you should convert your data into Atomic Files by yourself.
 
-.. _RecDatasets: https://github.com/RUCAIBox/RecDatasets
-
-For the dataset of ml-1m, the converting file is:
+For the ml-1m dataset, the converted atomic files are like:
 
 **ml-1m.inter**
 
@@ -45,11 +47,11 @@
 item_id:token movie_title:token_seq release_year:token genre:token_seq
 ============= ===================== ================== ============================
 
 
-Local Path
+Set data path
 ---------------
-Name of atomic files, name of dir that containing atomic files and ``config['dataset']`` should be the same.
-
-``config['data_path']`` should be the parent dir of the dir that containing atomic files.
+You need to set the data path in the config when you want to use a new dataset.
+The name of the atomic files, the name of the directory containing the atomic files and ``config['dataset']`` should be the same, and
+the ``data_path`` in your config should be the parent directory of the directory containing the atomic files.
 
 For example:
 
@@ -75,6 +77,7 @@ Suppose we use ml-1m to train BPR.
 According to the dataset information, the user should set the dataset information and filtering parameters in the
 configuration file `ml-1m.yaml`. For example, we conduct 10-core filtering, removing the ratings which are smaller than 3,
 the time of the record should be earlier than 97830000, and we only load inter data.
+The ``yaml`` file should be like:
 
 .. code:: yaml
 
@@ -86,11 +89,11 @@ For example, we conduct 10-core filtering, removing the ratings which are smalle
     load_col:
         inter: [user_id, item_id, rating, timestamp]
 
-    min_user_inter_num: 10
-    min_item_inter_num: 10
-    lowest_val:
-        rating: 3
-        timestamp: 97830000
+    user_inter_num_interval: "[10,inf)"
+    item_inter_num_interval: "[10,inf)"
+    val_interval:
+        rating: "[3,inf)"
+        timestamp: "[97830000, inf)"
 
 .. code:: python
 
@@ -108,14 +111,16 @@ Convert to Dataloader
 Here, we present how to convert :class:`~recbole.data.dataset.dataset.Dataset` into :obj:`Dataloader`.
 We firstly set the parameters in the configuration file `ml-1m.yaml`.
-We leverage random ordering + ratio-based splitting and full ranking with all item candidates, the splitting ratio is set as 8:1:1.
+Suppose we want to leverage random ordering + ratio-based splitting and full ranking with all item candidates, with the splitting ratio set as 8:1:1.
+You can add the following config in your `ml-1m.yaml`:
 
 .. code:: yaml
 
-   ...
-
-   eval_setting: RO_RS,full
-   split_ratio: [0.8,0.1,0.1]
+   eval_args:
+      split: {'RS': [8,1,1]}
+      group_by: user
+      order: RO
+      mode: full
 
 .. code:: python
 
diff --git a/docs/source/user_guide/usage/use_tensorboard.rst b/docs/source/user_guide/usage/use_tensorboard.rst
new file mode 100644
index 000000000..97088560b
--- /dev/null
+++ b/docs/source/user_guide/usage/use_tensorboard.rst
@@ -0,0 +1,21 @@
+Use Tensorboard
+====================
+
+In the latest release, RecBole allows tracking and visualizing the train loss and valid score with TensorBoard.
+
+In RecBole, TensorBoard outputs to the `./log_tensorboard/` directory by default. You can start TensorBoard with:
+
+.. code:: sh
+
+    $ tensorboard --logdir=log_tensorboard
+
+Then, go to the URL it provides or to http://localhost:6006/. You will see the following page.
+
+.. image:: ../../asset/tensorboard_1.png
+
+This dashboard shows how the train loss and valid score change with every epoch.
+
+You can also compare hyperparameters by switching to the 'HPARAMS' page
+from the header menu. It’s helpful to compare these metrics across different training runs to improve your model.
+
+.. image:: ../../asset/tensorboard_2.png
\ No newline at end of file
diff --git a/recbole/config/configurator.py b/recbole/config/configurator.py
index d569aa4c5..b0a98bfe1 100644
--- a/recbole/config/configurator.py
+++ b/recbole/config/configurator.py
@@ -277,7 +277,7 @@ def _set_default_parameters(self):
         if self.final_config_dict['loss_type'] in ['CE']:
             if self.final_config_dict['MODEL_TYPE'] == ModelType.SEQUENTIAL and \
                self.final_config_dict['neg_sampling'] is not None:
-                raise ValueError(f"neg_sampling [{self.final_config_dict['neg_sampling']}] should be 0 "
+                raise ValueError(f"neg_sampling [{self.final_config_dict['neg_sampling']}] should be None "
                                  f"when the loss_type is CE.")
             self.final_config_dict['MODEL_INPUT_TYPE'] = InputType.POINTWISE
         elif self.final_config_dict['loss_type'] in ['BPR']:
diff --git a/recbole/data/dataloader/general_dataloader.py b/recbole/data/dataloader/general_dataloader.py
index 6fb064bbb..cb64132f5 100644
--- a/recbole/data/dataloader/general_dataloader.py
+++ b/recbole/data/dataloader/general_dataloader.py
@@ -197,7 +197,6 @@ def _set_user_property(self, uid, used_item, positive_item):
         self.uid2positive_item[uid] = torch.tensor(list(positive_item), dtype=torch.int64)
         self.uid2items_num[uid] = len(positive_item)
         self.uid2history_item[uid] = torch.tensor(list(history_item), dtype=torch.int64)
-
     def _init_batch_size_and_step(self):
         batch_size = self.config['eval_batch_size']
         if not self.is_sequential:
diff --git a/recbole/data/dataset/dataset.py b/recbole/data/dataset/dataset.py
index 8d52a8004..649c4f7d6 100644
--- a/recbole/data/dataset/dataset.py
+++ b/recbole/data/dataset/dataset.py
@@ -325,7 +325,7 @@ def _load_additional_feat(self, token, dataset_path):
 
         For those additional features, e.g. pretrained entity embedding, user can set them as
         ``config['additional_feat_suffix']``, then they will be loaded and stored in
-        :attr:`feat_name_list`. See :doc:`../user_guide/data/data_args` for details.
+        :attr:`feat_name_list`. See :doc:`../user_guide/data/data_settings` for details.
 
         Args:
             token (str): dataset name.
diff --git a/recbole/data/utils.py b/recbole/data/utils.py index 599fcba3a..4b98fb2c6 100644 --- a/recbole/data/utils.py +++ b/recbole/data/utils.py @@ -97,6 +97,7 @@ def data_preparation(config, dataset, save=False): """ model_type = config['MODEL_TYPE'] built_datasets = dataset.build() + logger = getLogger() train_dataset, valid_dataset, test_dataset = built_datasets train_sampler, valid_sampler, test_sampler = create_samplers(config, dataset, built_datasets) @@ -109,7 +110,16 @@ def data_preparation(config, dataset, save=False): valid_data = get_dataloader(config, 'evaluation')(config, valid_dataset, valid_sampler, shuffle=False) test_data = get_dataloader(config, 'evaluation')(config, test_dataset, test_sampler, shuffle=False) - + logger.info( + set_color('[Training]: ', 'pink') + set_color('train_batch_size', 'cyan') + ' = ' + + set_color(f'[{config["train_batch_size"]}]', 'yellow') + set_color(' negative sampling', 'cyan') + ': '+ + set_color(f'[{config["neg_sampling"]}]', 'yellow') + ) + logger.info( + set_color('[Evaluation]: ', 'pink') + set_color('eval_batch_size', 'cyan') + ' = ' + + set_color(f'[{config["eval_batch_size"]}]', 'yellow') + set_color(' eval_args', 'cyan') + ': '+ + set_color(f'[{config["eval_args"]}]', 'yellow') + ) if save: save_split_dataloaders(config, dataloaders=(train_data, valid_data, test_data)) diff --git a/recbole/sampler/sampler.py b/recbole/sampler/sampler.py index c0fc428bf..d4484693f 100644 --- a/recbole/sampler/sampler.py +++ b/recbole/sampler/sampler.py @@ -60,7 +60,7 @@ def _uni_sampling(self, sample_num): raise NotImplementedError('Method [_uni_sampling] should be implemented') def _get_candidates_list(self): - """Get sample candidates list + """Get sample candidates list for _pop_sampling() Returns: candidates_list (list): a list of candidates id. diff --git a/recbole/trainer/trainer.py b/recbole/trainer/trainer.py index 6ad8ec986..d6669d5fb 100644 --- a/recbole/trainer/trainer.py +++ b/recbole/trainer/trainer.py @@ -386,7 +386,6 @@ def _full_sort_batch_eval(self, batched_data): scores[:, 0] = -np.inf if history_index is not None: scores[history_index] = -np.inf - return interaction, scores, positive_u, positive_i def _neg_sample_batch_eval(self, batched_data):