Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mar262024-Deleted irrelevant Output Files | Implemented Leave One Out | Tried Uni 20 #66

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .run.slurm.swp
Binary file not shown.
44 changes: 44 additions & 0 deletions config_files/original_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
seed: 2022
field_separator: "\t"
source_domain:
dataset: "/home/dmeher/datasets/AmazonBooks/AmazonBooks"
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
RATING_FIELD: rating
TIME_FIELD: timestamp
NEG_PREFIX: neg_
LABEL_FIELD: rating
load_col:
inter: [user_id, item_id, rating]
user_inter_num_interval: "[10,inf)"
item_inter_num_interval: "[10,inf)"
val_interval:
rating: "[3,inf)"
drop_filter_field: True

target_domain:
dataset: "/home/dmeher/datasets/AmazonMov/AmazonMov"
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
RATING_FIELD: rating
TIME_FIELD: timestamp
NEG_PREFIX: neg_
LABEL_FIELD: rating
load_col:
inter: [user_id, item_id, rating]
user_inter_num_interval: "[10,inf)"
item_inter_num_interval: "[10,inf)"
val_interval:
rating: "[3,inf)"
drop_filter_field: True

epochs: 5
train_batch_size: 4096
eval_batch_size: 409600
valid_metric: RMSE
embedding_size: 64
eval_setting: TO_LS,full
metrics: ['RMSE', 'MAE']
eval_args:
mode: labeled
  split: {'LS':'valid_and_test'}
61 changes: 61 additions & 0 deletions config_files/ranking_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
seed: 2022
field_separator: "\t"
source_domain:
dataset: "/home/dmeher/datasets/AmazonBooks/AmazonBooks"
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
RATING_FIELD: rating
TIME_FIELD: timestamp
NEG_PREFIX: neg_
LABEL_FIELD: label
load_col:
inter: [user_id, item_id, rating]
user_inter_num_interval: "[10,inf)"
item_inter_num_interval: "[10,inf)"
val_interval:
rating: "[3,inf)"
drop_filter_field: True

target_domain:
dataset: "/home/dmeher/datasets/AmazonMov/AmazonMov"
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
RATING_FIELD: rating
TIME_FIELD: timestamp
NEG_PREFIX: neg_
LABEL_FIELD: label
load_col:
inter: [user_id, item_id, rating]
user_inter_num_interval: "[10,inf)"
item_inter_num_interval: "[10,inf)"
val_interval:
rating: "[3,inf)"
drop_filter_field: True

epochs: 1
train_batch_size: 4096
eval_batch_size: 409600
valid_metric: NDCG@10
embedding_size: 64

eval_args:
split: {'LS':'valid_and_test'}
mode: 'uni100'
#mode: 'full'



#epochs: 1
#train_batch_size: 4096
#eval_batch_size: 409600
#valid_metric: NDCG@10
#embedding_size: 64
#metrics: ["Recall","MRR","NDCG","Hit","Precision"] # (list or str) Evaluation metrics.
#topk: [10] # (list or int or None) The value of k for topk evaluation metrics.
#valid_metric: MRR@10 # (str) The evaluation metric for early stopping.
#valid_metric_bigger: True
#metric_decimal_place: 4
#eval_args:
# split: {'LS':'valid_and_test'}
# group_by: user
# mode: full
44 changes: 44 additions & 0 deletions config_files/rating_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
seed: 2022
field_separator: "\t"
source_domain:
dataset: "/home/dmeher/datasets/AmazonBooks/AmazonBooks"
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
RATING_FIELD: rating
TIME_FIELD: timestamp
NEG_PREFIX: neg_
LABEL_FIELD: rating
load_col:
inter: [user_id, item_id, rating]
user_inter_num_interval: "[10,inf)"
item_inter_num_interval: "[10,inf)"
val_interval:
rating: "[3,inf)"
drop_filter_field: True

target_domain:
dataset: "/home/dmeher/datasets/AmazonMov/AmazonMov"
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
RATING_FIELD: rating
TIME_FIELD: timestamp
NEG_PREFIX: neg_
LABEL_FIELD: rating
load_col:
inter: [user_id, item_id, rating]
user_inter_num_interval: "[10,inf)"
item_inter_num_interval: "[10,inf)"
val_interval:
rating: "[3,inf)"
drop_filter_field: True

epochs: 5
train_batch_size: 4096
eval_batch_size: 409600
valid_metric: RMSE
embedding_size: 64
metrics: ['RMSE', 'MAE']
eval_setting: TO_LS,full
drop_filter_field: True
eval_args:
mode: labeled
76 changes: 73 additions & 3 deletions recbole_cdr/data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,72 @@ def get_history_matrix(self, user_num, item_num, row, value_field=None):

return torch.LongTensor(history_matrix), torch.FloatTensor(history_value), torch.LongTensor(history_len)

def _split_index_by_leave_one_out(self, grouped_index, leave_one_num):
"""Split indexes by strategy leave one out.

Args:
grouped_index (list of list of int): Index to be split.
leave_one_num (int): Number of parts whose length is expected to be ``1``.

Returns:
list: List of index that has been split.
"""
next_index = [[] for _ in range(leave_one_num + 1)]
for index in grouped_index:
index = list(index)
tot_cnt = len(index)
legal_leave_one_num = min(leave_one_num, tot_cnt - 1)
pr = tot_cnt - legal_leave_one_num
next_index[0].extend(index[:pr])
for i in range(legal_leave_one_num):
next_index[-legal_leave_one_num + i].append(index[pr])
pr += 1
return next_index

def leave_one_out(self, group_by, leave_one_mode):
    """Split interaction records by leave one out strategy.

    Args:
        group_by (str): Field name that interaction records should grouped by before splitting.
        leave_one_mode (str): The way to leave one out. It can only take three values:
            'valid_and_test', 'valid_only' and 'test_only'.

    Returns:
        list: List of :class:`~Dataset`, whose interaction features has been split.
    """
    self.logger.debug(
        f"leave one out, group_by=[{group_by}], leave_one_mode=[{leave_one_mode}]"
    )
    if group_by is None:
        raise ValueError("leave one out strategy require a group field")

    # Group interaction row indexes by the grouping field (typically user id).
    grouped = self._grouped_index(self.inter_feat[group_by].numpy())

    # Build a three-way [train, valid, test] index split; unused splits stay empty.
    if leave_one_mode == "valid_and_test":
        split_index = self._split_index_by_leave_one_out(grouped, leave_one_num=2)
    elif leave_one_mode == "valid_only":
        split_index = self._split_index_by_leave_one_out(grouped, leave_one_num=1)
        split_index.append([])  # no test split
    elif leave_one_mode == "test_only":
        train_index, test_index = self._split_index_by_leave_one_out(
            grouped, leave_one_num=1
        )
        split_index = [train_index, [], test_index]  # no valid split
    else:
        raise NotImplementedError(
            f"The leave_one_mode [{leave_one_mode}] has not been implemented."
        )

    self._drop_unused_col()
    # Materialize each index split as a shallow dataset copy over its rows.
    return [self.copy(self.inter_feat[index]) for index in split_index]

def split_train_valid(self):
self._change_feat_format()

Expand All @@ -266,7 +332,7 @@ def split_train_valid(self):
raise NotImplementedError(f'The ordering_method [{ordering_args}] has not been implemented.')

# splitting & grouping
split_args = self.config['eval_args']['split_valid']
split_args = self.config['eval_args']['split']
if split_args is None:
raise ValueError('The split_args in eval_args should not be None.')
if not isinstance(split_args, dict):
Expand All @@ -284,9 +350,13 @@ def split_train_valid(self):
datasets = self.split_by_ratio(split_args['RS'], group_by=self.uid_field)
else:
raise NotImplementedError(f'The grouping method [{group_by}] has not been implemented.')
elif split_mode == "LS":
datasets = self.leave_one_out(
group_by=self.uid_field, leave_one_mode=split_args["LS"]
)
else:
raise NotImplementedError(f'The splitting_method [{split_mode}] has not been implemented.')

return datasets


Expand Down Expand Up @@ -563,7 +633,7 @@ def build(self):
return [source_domain_train_dataset, None, target_domain_train_dataset,
target_domain_valid_dataset, target_domain_test_dataset]
else:
source_domain_train_dataset, source_domain_valid_dataset = self.source_domain_dataset.split_train_valid()
source_domain_train_dataset, source_domain_valid_dataset, source_domain_test_dataset = self.source_domain_dataset.split_train_valid()
return [source_domain_train_dataset, source_domain_valid_dataset, target_domain_train_dataset,
target_domain_valid_dataset, target_domain_test_dataset]

Expand Down