-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathutils.py
108 lines (90 loc) · 4.25 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import numpy as np
import pandas as pd
### Utility function and class
class EarlyStopMonitor(object):
def __init__(self, max_round=3, higher_better=True, tolerance=1e-3):
self.max_round = max_round
self.num_round = 0
self.epoch_count = 0
self.best_epoch = 0
self.last_best = None
self.higher_better = higher_better
self.tolerance = tolerance
def early_stop_check(self, curr_val):
self.epoch_count += 1
if not self.higher_better:
curr_val *= -1
if self.last_best is None:
self.last_best = curr_val
elif (curr_val - self.last_best) / np.abs(self.last_best) > self.tolerance:
self.last_best = curr_val
self.num_round = 0
self.best_epoch = self.epoch_count
else:
self.num_round += 1
return self.num_round >= self.max_round
class RandEdgeSampler(object):
def __init__(self, src_list, dst_list, ts_list):
self.edges = {}
for u, i in zip(src_list, dst_list):
if u in self.edges:
self.edges[u].add(i)
else:
self.edges[u] = set([i])
self.src_list = np.unique(src_list)
self.dst_list = np.unique(dst_list)
df = pd.DataFrame({'u': src_list, 'i': dst_list, 'ts': ts_list})
self.items_popularity = df.groupby('i')['i'].count().to_dict()
self.time_sorted_df = df.sort_values(by=['ts'])
def sample(self, size):
src_index = np.random.randint(0, len(self.src_list), size)
dst_index = np.random.randint(0, len(self.dst_list), size)
return self.src_list[src_index], self.dst_list[dst_index]
def sample_neg(self, src_list):
dst_set = set(self.dst_list)
neg_dst = np.zeros(len(src_list), dtype=int)
for ind, u in enumerate(src_list):
random_neg = np.random.choice(list(dst_set-self.edges[u]), 1)[0]
neg_dst[ind] = random_neg
return neg_dst
def popularity_based_sample_neg(self, src_list):
dst_set = set(self.dst_list)
neg_dst = np.zeros(len(src_list), dtype=int)
for ind, u in enumerate(src_list):
neg_candidate = list(dst_set - self.edges[u])
neg_candidate_pop_items = []
neg_candidate_pop = []
for neg_item in neg_candidate:
neg_candidate_pop_items.append(neg_item)
neg_candidate_pop.append(self.items_popularity[neg_item])
total_popularity = sum(neg_candidate_pop)
neg_candidate_pop_prob = [pop / total_popularity for pop in neg_candidate_pop]
random_neg = np.random.choice(neg_candidate_pop_items, size=1, p=neg_candidate_pop_prob)
neg_dst[ind] = random_neg
return neg_dst
def timelypopularity_based_sample_neg(self, src_list, ts_list):
neg_dst = np.zeros(len(src_list), dtype=int)
range_ind = 1500
all_ts = self.time_sorted_df['ts'].values
for ind, u in enumerate(src_list):
ts_cut = ts_list[ind]
min_ts_diff_ind = np.argmin(abs(all_ts - ts_cut))
prev_min_ind = max(0, min_ts_diff_ind - range_ind)
later_max_ind = min(self.time_sorted_df.shape[0], min_ts_diff_ind + range_ind)
timely_selected_df = self.time_sorted_df.iloc[prev_min_ind:later_max_ind, :]
selected_items_popularity = timely_selected_df.groupby('i')['i'].count().to_dict()
neg_candidate = list(set(list(selected_items_popularity.keys())) - self.edges[u])
if len(neg_candidate) == 0:
print(selected_items_popularity)
print(self.edges[u])
print(prev_min_ind, later_max_ind)
neg_candidate_pop_items = []
neg_candidate_pop = []
for neg_item in neg_candidate:
neg_candidate_pop_items.append(neg_item)
neg_candidate_pop.append(selected_items_popularity[neg_item])
total_popularity = sum(neg_candidate_pop)
neg_candidate_pop_prob = [pop / total_popularity for pop in neg_candidate_pop]
random_neg = np.random.choice(neg_candidate_pop_items, size=1, p=neg_candidate_pop_prob)
neg_dst[ind] = random_neg
return neg_dst