Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEA & DOC: add encoding for reading atomic files and fix doc. #966

Merged
merged 4 commits into from
Sep 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/user_guide/config/environment_settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ Environment settings are designed to set basic parameters of running environment
- ``seed (int)`` : Random seed. Defaults to ``2020``.
- ``state (str)`` : Logging level. Defaults to ``'INFO'``.
Range in ``['INFO', 'DEBUG', 'WARNING', 'ERROR', 'CRITICAL']``.
- ``encoding (str)`` : Encoding to use for reading atomic files. Defaults to ``'utf-8'``.
The available encodings are listed `here <https://docs.python.org/3/library/codecs.html#standard-encodings>`__.
- ``reproducibility (bool)`` : If True, the tool will use deterministic
convolution algorithms, which makes the result reproducible. If False,
the tool will benchmark multiple convolution algorithms and select the fastest one,
Expand Down
1 change: 1 addition & 0 deletions recbole/data/dataloader/abstract_dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ class NegSampleDataLoader(AbstractDataLoader):
sampler (Sampler): The sampler of dataloader.
shuffle (bool, optional): Whether the dataloader will be shuffled after a round. Defaults to ``True``.
"""

def __init__(self, config, dataset, sampler, shuffle=True):
super().__init__(config, dataset, sampler, shuffle=shuffle)

Expand Down
7 changes: 4 additions & 3 deletions recbole/data/dataloader/general_dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ class NegSampleEvalDataLoader(NegSampleDataLoader):
sampler (Sampler): The sampler of dataloader.
shuffle (bool, optional): Whether the dataloader will be shuffled after a round. Defaults to ``False``.
"""

def __init__(self, config, dataset, sampler, shuffle=False):
self._set_neg_sample_args(config, dataset, InputType.POINTWISE, config['eval_neg_sample_args'])
if self.neg_sample_args['strategy'] == 'by':
Expand Down Expand Up @@ -193,7 +194,7 @@ def _set_user_property(self, uid, used_item, positive_item):
if uid is None:
return
history_item = used_item - positive_item
self.uid2positive_item[uid] = torch.tensor(list(positive_item), dtype=torch.int64)
self.uid2positive_item[uid] = torch.tensor(list(positive_item), dtype=torch.int64)
self.uid2items_num[uid] = len(positive_item)
self.uid2history_item[uid] = torch.tensor(list(history_item), dtype=torch.int64)

Expand Down Expand Up @@ -222,7 +223,7 @@ def _next_batch_data(self):
if not self.is_sequential:
user_df = self.user_df[self.pr:self.pr + self.step]
uid_list = list(user_df[self.uid_field])

history_item = self.uid2history_item[uid_list]
positive_item = self.uid2positive_item[uid_list]

Expand All @@ -241,4 +242,4 @@ def _next_batch_data(self):
positive_i = interaction[self.iid_field]

self.pr += self.step
return interaction, None, positive_u, positive_i
return interaction, None, positive_u, positive_i
1 change: 1 addition & 0 deletions recbole/data/dataset/customized_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ class DIENDataset(SequentialDataset):
neg_item_list_field (str): Field name for negative item sequence.
neg_item_list (torch.tensor): all users' negative item history sequence.
"""

def __init__(self, config):
super().__init__(config)

Expand Down
51 changes: 33 additions & 18 deletions recbole/data/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,10 @@ def _get_download_url(self, url_file, allow_none=False):
elif allow_none:
return None
else:
raise ValueError(f'Neither [{self.dataset_path}] exists in the device'
f'nor [{self.dataset_name}] a known dataset name.')
raise ValueError(
f'Neither [{self.dataset_path}] exists in the device'
f'nor [{self.dataset_name}] a known dataset name.'
)

def _download(self):
url = self._get_download_url('url')
Expand Down Expand Up @@ -404,7 +406,8 @@ def _load_feat(self, filepath, source):
columns = []
usecols = []
dtype = {}
with open(filepath, 'r') as f:
encoding = self.config['encoding']
with open(filepath, 'r', encoding=encoding) as f:
head = f.readline()[:-1]
for field_type in head.split(field_separator):
field, ftype = field_type.split(':')
Expand All @@ -429,7 +432,9 @@ def _load_feat(self, filepath, source):
self.logger.warning(f'No columns has been loaded from [{source}]')
return None

df = pd.read_csv(filepath, delimiter=self.config['field_separator'], usecols=usecols, dtype=dtype)
df = pd.read_csv(
filepath, delimiter=self.config['field_separator'], usecols=usecols, dtype=dtype, encoding=encoding
)
df.columns = columns

seq_separator = self.config['seq_separator']
Expand Down Expand Up @@ -462,15 +467,19 @@ def _init_alias(self):
if alias_name_1 != alias_name_2:
intersect = np.intersect1d(alias_1, alias_2, assume_unique=True)
if len(intersect) > 0:
raise ValueError(f'`alias_of_{alias_name_1}` and `alias_of_{alias_name_2}` '
f'should not have the same field {list(intersect)}.')
raise ValueError(
f'`alias_of_{alias_name_1}` and `alias_of_{alias_name_2}` '
f'should not have the same field {list(intersect)}.'
)

self._rest_fields = self.token_like_fields
for alias_name, alias in self.alias.items():
isin = np.isin(alias, self._rest_fields, assume_unique=True)
if isin.all() is False:
raise ValueError(f'`alias_of_{alias_name}` should not contain '
f'non-token-like field {list(alias[~isin])}.')
raise ValueError(
f'`alias_of_{alias_name}` should not contain '
f'non-token-like field {list(alias[~isin])}.'
)
self._rest_fields = np.setdiff1d(self._rest_fields, alias, assume_unique=True)

def _user_item_feat_preparation(self):
Expand All @@ -484,7 +493,7 @@ def _user_item_feat_preparation(self):
if self.item_feat is not None:
new_item_df = pd.DataFrame({self.iid_field: np.arange(self.item_num)})
self.item_feat = pd.merge(new_item_df, self.item_feat, on=self.iid_field, how='left')
self.logger.debug(set_color('ordering item features by user id.', 'green'))
self.logger.debug(set_color('ordering item features by item id.', 'green'))

def _preload_weight_matrix(self):
"""Transfer preload weight features into :class:`numpy.ndarray` with shape ``[id_token_length]``
Expand Down Expand Up @@ -592,6 +601,7 @@ def _normalize(self):

for field in fields:
for feat in self.field2feats(field):

def norm(arr):
mx, mn = max(arr), min(arr)
if mx == mn:
Expand Down Expand Up @@ -675,14 +685,18 @@ def _filter_by_inter_num(self):
item_inter_num = Counter(self.inter_feat[self.iid_field].values) if item_inter_num_interval else Counter()

while True:
ban_users = self._get_illegal_ids_by_inter_num(field=self.uid_field,
feat=self.user_feat,
inter_num=user_inter_num,
inter_interval=user_inter_num_interval)
ban_items = self._get_illegal_ids_by_inter_num(field=self.iid_field,
feat=self.item_feat,
inter_num=item_inter_num,
inter_interval=item_inter_num_interval)
ban_users = self._get_illegal_ids_by_inter_num(
field=self.uid_field,
feat=self.user_feat,
inter_num=user_inter_num,
inter_interval=user_inter_num_interval
)
ban_items = self._get_illegal_ids_by_inter_num(
field=self.iid_field,
feat=self.item_feat,
inter_num=item_inter_num,
inter_interval=item_inter_num_interval
)

if len(ban_users) == 0 and len(ban_items) == 0:
break
Expand Down Expand Up @@ -722,7 +736,8 @@ def _get_illegal_ids_by_inter_num(self, field, feat, inter_num, inter_interval=N
set: illegal ids, whose inter num out of inter_intervals.
"""
self.logger.debug(
set_color('get_illegal_ids_by_inter_num', 'blue') + f': field=[{field}], inter_interval=[{inter_interval}]')
set_color('get_illegal_ids_by_inter_num', 'blue') + f': field=[{field}], inter_interval=[{inter_interval}]'
)

if inter_interval is not None:
if len(inter_interval) > 1:
Expand Down
2 changes: 1 addition & 1 deletion recbole/data/dataset/sequential_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def inter_matrix(self, form='coo', value_field=None):
for field in l1_inter_dict:
if field != self.uid_field and field + list_suffix in l1_inter_dict:
candidate_field_set.add(field)
new_dict[field] = torch.cat([self.inter_feat[field], l1_inter_dict[field + list_suffix][:,0]])
new_dict[field] = torch.cat([self.inter_feat[field], l1_inter_dict[field + list_suffix][:, 0]])
elif (not field.endswith(list_suffix)) and (field != self.item_list_length_field):
new_dict[field] = torch.cat([self.inter_feat[field], l1_inter_dict[field]])
local_inter_feat = Interaction(new_dict)
Expand Down
4 changes: 2 additions & 2 deletions recbole/data/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,12 @@ def data_preparation(config, dataset, save=False):
test_data = get_dataloader(config, 'evaluation')(config, test_dataset, test_sampler, shuffle=False)
logger.info(
set_color('[Training]: ', 'pink') + set_color('train_batch_size', 'cyan') + ' = ' +
set_color(f'[{config["train_batch_size"]}]', 'yellow') + set_color(' negative sampling', 'cyan') + ': '+
set_color(f'[{config["train_batch_size"]}]', 'yellow') + set_color(' negative sampling', 'cyan') + ': ' +
set_color(f'[{config["neg_sampling"]}]', 'yellow')
)
logger.info(
set_color('[Evaluation]: ', 'pink') + set_color('eval_batch_size', 'cyan') + ' = ' +
set_color(f'[{config["eval_batch_size"]}]', 'yellow') + set_color(' eval_args', 'cyan') + ': '+
set_color(f'[{config["eval_batch_size"]}]', 'yellow') + set_color(' eval_args', 'cyan') + ': ' +
set_color(f'[{config["eval_args"]}]', 'yellow')
)
if save:
Expand Down
8 changes: 6 additions & 2 deletions recbole/evaluator/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@
import torch
import copy


class DataStruct(object):

def __init__(self):
self._data_dict = {}

Expand Down Expand Up @@ -64,6 +66,7 @@ class Collector(object):
This class is only used in Trainer.

"""

def __init__(self, config):
self.config = config
self.data_struct = DataStruct()
Expand Down Expand Up @@ -123,7 +126,9 @@ def _average_rank(self, scores):

return avg_rank

def eval_batch_collect(self, scores_tensor: torch.Tensor, interaction, positive_u: torch.Tensor, positive_i: torch.Tensor):
def eval_batch_collect(
self, scores_tensor: torch.Tensor, interaction, positive_u: torch.Tensor, positive_i: torch.Tensor
):
""" Collect the evaluation resource from batched eval data and batched model output.
Args:
scores_tensor (torch.Tensor): the output tensor of the model with the shape of `(N, )`
Expand Down Expand Up @@ -173,7 +178,6 @@ def eval_batch_collect(self, scores_tensor: torch.Tensor, interaction, positive_
self.data_struct.update_tensor('data.label', interaction[self.label_field].to(self.device))

def model_collect(self, model: torch.nn.Module):

""" Collect the evaluation resource from model.
Args:
model (nn.Module): the trained recommendation model.
Expand Down
1 change: 0 additions & 1 deletion recbole/evaluator/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,3 @@ def evaluate(self, dataobject: DataStruct):
metric_val = self.metric_class[metric].calculate_metric(dataobject)
result_dict.update(metric_val)
return result_dict

9 changes: 9 additions & 0 deletions recbole/evaluator/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class Hit(TopkMetric):
:math:`\delta(·)` is an indicator function. :math:`\delta(b)` = 1 if :math:`b` is true and 0 otherwise.
:math:`\emptyset` denotes the empty set.
"""

def __init__(self, config):
super().__init__(config)

Expand All @@ -74,6 +75,7 @@ class MRR(TopkMetric):

:math:`{rank}_{u}^{*}` is the rank position of the first relevant item found by an algorithm for a user :math:`u`.
"""

def __init__(self, config):
super().__init__(config)

Expand Down Expand Up @@ -110,6 +112,7 @@ class MAP(TopkMetric):

:math:`\hat{R}_{j}(u)` is the j-th item in the recommendation list :math:`\hat{R}(u)`.
"""

def __init__(self, config):
super().__init__(config)
self.config = config
Expand Down Expand Up @@ -143,6 +146,7 @@ class Recall(TopkMetric):

:math:`|R(u)|` represents the item count of :math:`R(u)`.
"""

def __init__(self, config):
super().__init__(config)

Expand All @@ -169,6 +173,7 @@ class NDCG(TopkMetric):

:math:`\delta(·)` is an indicator function.
"""

def __init__(self, config):
super().__init__(config)

Expand Down Expand Up @@ -208,6 +213,7 @@ class Precision(TopkMetric):

:math:`|\hat R(u)|` represents the item count of :math:`\hat R(u)`.
"""

def __init__(self, config):
super().__init__(config)

Expand All @@ -223,6 +229,7 @@ def metric_info(self, pos_index):

# CTR Metrics


class GAUC(AbstractMetric):
r"""GAUC (also known as Grouped Area Under Curve) is used to evaluate the two-class model, referring to
the area under the ROC curve grouped by user. We weighted the index of each user :math:`u` by the number of positive
Expand Down Expand Up @@ -321,6 +328,7 @@ class AUC(LossMetric):
:math:`N` denotes the total number of user-item interactions.
:math:`rank_i` denotes the descending rank of the i-th positive item.
"""

def __init__(self, config):
super().__init__(config)

Expand Down Expand Up @@ -357,6 +365,7 @@ def metric_info(self, preds, trues):

# Loss-based Metrics


class MAE(LossMetric):
r"""MAE_ (also known as Mean Absolute Error regression loss) is used to evaluate the difference between
the score predicted by the model and the actual behavior of the user.
Expand Down
7 changes: 4 additions & 3 deletions recbole/evaluator/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@ def cluster_info(module_name):
"""
smaller_m = []
m_dict, m_info, m_types = {}, {}, {}
metric_class = inspect.getmembers(sys.modules[module_name],
lambda x: inspect.isclass(x) and x.__module__ == module_name)
metric_class = inspect.getmembers(
sys.modules[module_name], lambda x: inspect.isclass(x) and x.__module__ == module_name
)
for name, metric_cls in metric_class:
name = name.lower()
m_dict[name] = metric_cls
Expand All @@ -66,6 +67,7 @@ class Register(object):
It is a member of DataCollector.
The DataCollector collects the resources needed by the Evaluator under the guidance of Register.
"""

def __init__(self, config):

self.config = config
Expand All @@ -88,4 +90,3 @@ def need(self, key: str):
if hasattr(self, key):
return getattr(self, key)
return False

14 changes: 9 additions & 5 deletions recbole/model/abstract_recommender.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,11 +169,15 @@ class ContextRecommender(AbstractRecommender):
def __init__(self, config, dataset):
super(ContextRecommender, self).__init__()

self.field_names = dataset.fields(source=[
FeatureSource.INTERACTION,
FeatureSource.USER, FeatureSource.USER_ID,
FeatureSource.ITEM, FeatureSource.ITEM_ID,
])
self.field_names = dataset.fields(
source=[
FeatureSource.INTERACTION,
FeatureSource.USER,
FeatureSource.USER_ID,
FeatureSource.ITEM,
FeatureSource.ITEM_ID,
]
)
self.LABEL = config['LABEL_FIELD']
self.embedding_size = config['embedding_size']
self.device = config['device']
Expand Down
2 changes: 1 addition & 1 deletion recbole/model/general_recommender/macridvae.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# @Email : [email protected]

# UPDATE
# @Time : 2021/6/30,
# @Time : 2021/6/30,
# @Author : Xingyu Pan
# @email : [email protected]

Expand Down
14 changes: 9 additions & 5 deletions recbole/model/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -918,11 +918,15 @@ class FMFirstOrderLinear(nn.Module):
def __init__(self, config, dataset, output_dim=1):

super(FMFirstOrderLinear, self).__init__()
self.field_names = dataset.fields(source=[
FeatureSource.INTERACTION,
FeatureSource.USER, FeatureSource.USER_ID,
FeatureSource.ITEM, FeatureSource.ITEM_ID,
])
self.field_names = dataset.fields(
source=[
FeatureSource.INTERACTION,
FeatureSource.USER,
FeatureSource.USER_ID,
FeatureSource.ITEM,
FeatureSource.ITEM_ID,
]
)
self.LABEL = config['LABEL_FIELD']
self.device = config['device']
self.token_field_names = []
Expand Down
Loading