-
Notifications
You must be signed in to change notification settings - Fork 55
/
Copy pathdata_util.py
118 lines (100 loc) · 4.47 KB
/
data_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#coding=utf8
'''
utils to process data for experiments
'''
import time
import logging
import numpy as np
class DataLoader(object):
    '''
    Load the train and test data, including the representations of users
    and items generated from the meta-graphs.

    input: a config object exposing ``get(key)`` (e.g. a dict) with:
        data_dir        -- directory prefix; it is concatenated directly with
                           file names, so it must end with a path separator
        train_filename  -- rating file with rows ``user_id item_id rating``
        test_filename   -- rating file in the same format
        N               -- length of one feature row (user part + item part)
        F               -- number of latent features per meta-graph
        L               -- stored on the instance, not used by the loader
        dt              -- dataset type; 'synthetic' loads ready-made matrices
                           named by the train_X/train_Y/test_X/test_Y keys
        meta_graphs     -- names of the meta-graphs whose features are loaded
        topK            -- topK value embedded in the feature file names
    return: train_X, train_Y, test_X, test_Y (see get_exp_data).
    Besides, logs some information about the loaded data.
    '''

    def __init__(self, config):
        self.config = config
        self.data_dir = config.get('data_dir')
        self.train_filename = config.get('train_filename')
        self.test_filename = config.get('test_filename')
        self.N = config.get('N')
        self.F = config.get('F')
        self.L = config.get('L')
        if config.get('dt') == 'synthetic':
            self._load_random_data()
        else:
            self._load()

    def _load_random_data(self):
        '''
        load pre-generated feature matrices and targets directly from the
        files named in the config
        '''
        self.train_X = np.loadtxt(self.data_dir + self.config.get('train_X'))
        self.train_Y = np.loadtxt(self.data_dir + self.config.get('train_Y'))
        self.test_X = np.loadtxt(self.data_dir + self.config.get('test_X'))
        self.test_Y = np.loadtxt(self.data_dir + self.config.get('test_Y'))

    def _load(self):
        '''
        load the rating files and build one feature row per rating by
        concatenating the user representation and the item representation
        '''
        start_time = time.time()
        # ndmin=2 keeps a single-row file two-dimensional, so shape[0] is the
        # number of ratings and column slicing keeps working
        train_data = np.loadtxt(self.data_dir + self.train_filename, ndmin=2)
        test_data = np.loadtxt(self.data_dir + self.test_filename, ndmin=2)

        uid2reps, bid2reps = self._load_representation()

        self.train_Y = train_data[:, 2]
        self.test_Y = test_data[:, 2]
        # every id in the train data must have a representation (KeyError
        # otherwise); ids that only occur in the test data get all-zero features
        self.train_X = self._build_features(train_data, uid2reps, bid2reps, False)
        self.test_X = self._build_features(test_data, uid2reps, bid2reps, True)

        logging.info('finish loading data, cost %.2fs, train_X.shape=%s, test_X.shape=%s, '
                     'train_X sparsity=%.4f, test_X sparsity=%.4f',
                     time.time() - start_time, self.train_X.shape, self.test_X.shape,
                     self._sparsity(self.train_X), self._sparsity(self.test_X))

    def _build_features(self, data, uid2reps, bid2reps, allow_missing):
        '''
        build the (num_ratings, N) feature matrix for the given rating rows;
        when allow_missing is True, unknown ids are mapped to zero vectors
        '''
        features = np.zeros((data.shape[0], self.N))
        # integer division: np.zeros rejects a float length on Python 3
        missing = np.zeros(self.N // 2)
        for ind, (u, b, _) in enumerate(data):
            if allow_missing:
                ur = uid2reps.get(int(u), missing)
                br = bid2reps.get(int(b), missing)
            else:
                ur = uid2reps[int(u)]
                br = bid2reps[int(b)]
            features[ind] = np.concatenate((ur, br))
        return features

    @staticmethod
    def _sparsity(matrix):
        '''
        fraction of non-zero entries of the matrix (0.0 for an empty matrix)
        '''
        if matrix.size == 0:
            return 0.0
        return np.count_nonzero(matrix) * 1.0 / matrix.size

    def _generate_feature_files(self):
        '''
        return the user and item feature file names, one per meta-graph, in
        the order of the configured meta-graphs
        '''
        meta_graphs = self.config.get('meta_graphs')
        topK = self.config.get('topK')
        ufiles, vfiles = [], []
        for graph in meta_graphs:
            if graph == 'ratings_only':
                ufiles.append('ratings_only_user.dat')
                vfiles.append('ratings_only_item.dat')
            else:
                ufiles.append('%s_top%s_user.dat' % (graph, topK))
                vfiles.append('%s_top%s_item.dat' % (graph, topK))
        return ufiles, vfiles

    @staticmethod
    def _read_ids(filename):
        '''
        read one integer id per line, skipping blank lines
        '''
        with open(filename, 'r') as fin:
            return [int(line) for line in fin if line.strip()]

    def _fill_representations(self, id2reps, filenames, feature_dir):
        '''
        copy the features of every file into its slot of the representation
        vectors; the find-th file fills positions [find * F, (find + 1) * F)
        '''
        for find, filename in enumerate(filenames):
            # each row is expected to be: id followed by the latent features
            feats = np.loadtxt(feature_dir + filename, dtype=np.float64, ndmin=2)
            cur = find * self.F
            for row in feats:
                id2reps[int(row[0])][cur:cur + self.F] = row[1:]

    def _load_representation(self):
        '''
        load user and item latent features generated by MF for every meta-graph

        return: (uid2reps, bid2reps), dicts mapping each id listed in
        uids.txt / bids.txt to a vector of length N // 2
        '''
        # integer division: np.zeros rejects a float length on Python 3
        fnum = self.N // 2
        uids = self._read_ids(self.data_dir + 'uids.txt')
        bids = self._read_ids(self.data_dir + 'bids.txt')
        uid2reps = {k: np.zeros(fnum, dtype=np.float64) for k in uids}
        bid2reps = {k: np.zeros(fnum, dtype=np.float64) for k in bids}

        ufiles, vfiles = self._generate_feature_files()
        feature_dir = self.data_dir + 'mf_features/path_count/'
        self._fill_representations(uid2reps, ufiles, feature_dir)
        self._fill_representations(bid2reps, vfiles, feature_dir)

        logging.info('load all representations, len(ufiles)=%s, len(vfiles)=%s, ufiles=%s, vfiles=%s', len(ufiles), len(vfiles), '|'.join(ufiles), '|'.join(vfiles))
        return uid2reps, bid2reps

    def get_exp_data(self):
        '''
        return the loaded data as (train_X, train_Y, test_X, test_Y)
        '''
        return self.train_X, self.train_Y, self.test_X, self.test_Y