# data_loader.py
# NOTE: this file was captured from a repository web viewer; the page
# navigation text and the 1-260 line-number gutter that preceded the source
# were extraction residue and have been dropped. The module source follows.
import numpy as np
import scipy.io as sio
from torch.utils.data import Dataset, DataLoader
from utils import TT_split, normalize,knn
import torch
import random
from numpy.random import randint
from sklearn.preprocessing import OneHotEncoder
def load_data(dataset, neg_prop, aligned_prop, complete_prop, is_noise):
    """Load a two-view dataset and build training pairs plus the full evaluation set.

    The ``.mat`` file ``./datasets/<dataset>.mat`` is read, two views and the
    class labels are extracted, the samples are split by ``TT_split`` into an
    aligned ("train") part and a remaining ("test") part, and positive/negative
    pairs are built from the aligned part with ``get_pairs``.

    :param dataset: dataset name; must be one of the names handled below
    :param neg_prop: number of negative pairs sampled per anchor (passed to ``get_pairs``)
    :param aligned_prop: proportion of samples treated as aligned; ``1`` skips the shuffling
    :param complete_prop: proportion of complete samples; ``1 - complete_prop`` is passed
        to ``get_sn`` as the missing rate
    :param is_noise: truthy to train with the noisy pair labels, falsy to train with
        the class-derived ("real") pair labels
    :return: ``(train_pairs, train_pair_labels, train_pair_real_labels, all_data,
        all_label, all_label_X, all_label_Y, divide_seed, mask)``; ``train_pairs`` and
        ``all_data`` are two-element lists holding the transposed view matrices
    :raises ValueError: if ``dataset`` is not one of the supported names
    """
    all_data = []
    train_pairs = []

    mat = sio.loadmat('./datasets/' + dataset + '.mat')
    if dataset == 'Scene15':
        data = mat['X'][0][0:2]  # 20, 59 dimensions
        label = np.squeeze(mat['Y'])
        print(data[1].shape)
    elif dataset == 'Caltech101':
        data = mat['X'][0][3:5]
        label = np.squeeze(mat['Y'])
    elif dataset == 'Reuters_dim10':
        data = []  # 18758 samples
        data.append(normalize(np.vstack((mat['x_train'][0], mat['x_test'][0]))))
        data.append(normalize(np.vstack((mat['x_train'][1], mat['x_test'][1]))))
        label = np.squeeze(np.hstack((mat['y_train'], mat['y_test'])))
    elif dataset == 'NoisyMNIST-30000':
        data = []
        data.append(mat['X1'])
        data.append(mat['X2'])
        label = np.squeeze(mat['Y'])
    # deep features of Caltech101
    elif dataset == '2view-caltech101-8677sample':
        data = []
        label = np.squeeze(mat['gt'])
        data.append(mat['X'][0][0].T)
        data.append(mat['X'][0][1].T)
    elif dataset == 'MNIST-USPS':
        data = []
        data.append(mat['X1'])
        data.append(normalize(mat['X2']))
        label = np.squeeze(mat['Y'])
    # deep features of Animal
    elif dataset == 'AWA-7view-10158sample':
        data = []
        label = np.squeeze(mat['gt'])
        data.append(mat['X'][0][5].T)
        data.append(mat['X'][0][6].T)
    else:
        # Previously an unknown name fell through and failed later on an
        # unbound ``data``; fail early with a clear message instead.
        raise ValueError('Unsupported dataset: ' + repr(dataset))
    print("加载的数据集为:", dataset)

    # The split seed is drawn at random and returned so a run can be reproduced.
    divide_seed = random.randint(1, 1000)
    print('k=', divide_seed)
    train_idx, test_idx = TT_split(len(label), 1 - aligned_prop, divide_seed)
    train_label, test_label = label[train_idx], label[test_idx]
    train_X, train_Y = data[0][train_idx], data[1][train_idx]
    test_X, test_Y = data[0][test_idx], data[1][test_idx]

    # Use test_prop*sizeof(all data) to train the MvCLN, and shuffle the rest data to simulate the unaligned data.
    # Note that, MvCLN establishes the correspondence of the all data rather than the unaligned portion in the testing.
    # When test_prop = 0, MvCLN is directly performed on the all data without shuffling.
    if aligned_prop == 1:
        all_data.append(train_X.T)
        all_data.append(train_Y.T)
        all_label, all_label_X, all_label_Y = train_label, train_label, train_label
    else:
        # Permute the second view of the test part (and its labels) to break alignment.
        shuffle_idx = random.sample(range(len(test_Y)), len(test_Y))
        test_Y = test_Y[shuffle_idx]
        test_label_X, test_label_Y = test_label, test_label[shuffle_idx]
        all_data.append(np.concatenate((train_X, test_X)).T)
        all_data.append(np.concatenate((train_Y, test_Y)).T)
        all_label = np.concatenate((train_label, test_label))
        all_label_X = np.concatenate((train_label, test_label_X))
        all_label_Y = np.concatenate((train_label, test_label_Y))

    test_mask = get_sn(2, len(test_label), 1 - complete_prop)
    if aligned_prop == 1.:
        # NOTE(review): this mask has len(test_label) rows but is used below to
        # index the train arrays; assumes TT_split yields equally long train and
        # test index sets when the test proportion is 0 -- TODO confirm in utils.
        mask = test_mask
    else:
        # The aligned part is treated as fully observed in both views.
        identy_mask = np.ones((len(train_label), 2))
        mask = np.concatenate((identy_mask, test_mask))

    # pair construction. view 0 and 1 refer to pairs constructed for training.
    # noisy and real labels refer to 0/1 label of those pairs
    if aligned_prop == 1.:
        # Only samples present in both views can form training pairs.
        valid_idx = np.logical_and(mask[:, 0], mask[:, 1])
    else:
        valid_idx = np.ones_like(train_label).astype(np.bool_)

    view0, view1, noisy_labels, real_labels, _, _ = \
        get_pairs(train_X[valid_idx], train_Y[valid_idx], neg_prop, train_label[valid_idx])

    # Noise rate = mislabeled pairs / constructed negative pairs. Count the
    # negatives directly (label 0) rather than subtracting len(train_X), which
    # is wrong whenever valid_idx dropped samples, and guard the empty case.
    mislabeled = int(np.count_nonzero(noisy_labels != real_labels))
    num_negative = int(np.count_nonzero(noisy_labels == 0))
    noise_rate = round(mislabeled / num_negative, 2) if num_negative else 0.0
    print('noise rate of the constructed neg. pairs is ', noise_rate)

    if is_noise:  # training with noisy negative correspondence
        print("----------------------Training with noisy_labels----------------------")
        train_pair_labels = noisy_labels
    else:  # training with gt negative correspondence
        print("----------------------Training with real_labels----------------------")
        train_pair_labels = real_labels
    train_pairs.append(view0.T)
    train_pairs.append(view1.T)
    train_pair_real_labels = real_labels

    return (train_pairs, train_pair_labels, train_pair_real_labels, all_data,
            all_label, all_label_X, all_label_Y, divide_seed, mask)
def get_pairs(train_X, train_Y, neg_prop, train_label):
    """Build positive and randomly sampled negative cross-view pairs.

    Every index ``i`` contributes one positive pair ``(train_X[i], train_Y[i])``.
    Then, for every anchor ``j`` in view 0, ``neg_prop`` distinct indices are
    drawn from view 1 with ``random.sample`` and paired with the anchor as
    negatives. A negative whose two samples share a class label keeps pair
    label 0 but gets "real" label 1 -- this is the label noise.

    :param train_X: samples of view 0, indexable by integer
    :param train_Y: samples of view 1, indexable by integer
    :param neg_prop: number of negatives drawn per anchor
    :param train_label: class label per sample
    :return: ``(view0, view1, labels, real_labels, class_labels0, class_labels1)``;
        the views are float32 arrays, everything else int64 arrays, positives first
    """
    n_anchor = len(train_X)

    # Positive pairs: sample i of view 0 with sample i of view 1.
    left_idx = list(range(n_anchor))
    right_idx = list(range(n_anchor))

    # Negative pairs: one random.sample call per anchor, in anchor order.
    for anchor in range(n_anchor):
        drawn = random.sample(range(len(train_Y)), neg_prop)
        left_idx.extend([anchor] * neg_prop)
        right_idx.extend(drawn)

    n_negative = len(left_idx) - n_anchor
    pair_flags = [1] * n_anchor + [0] * n_negative

    # Real label of a negative is 0 only when the two class labels differ.
    true_flags = [1] * n_anchor
    for a, b in zip(left_idx[n_anchor:], right_idx[n_anchor:]):
        true_flags.append(0 if train_label[a] != train_label[b] else 1)

    view0 = np.array([train_X[a] for a in left_idx], dtype=np.float32)
    view1 = np.array([train_Y[b] for b in right_idx], dtype=np.float32)
    labels = np.array(pair_flags, dtype=np.int64)
    real_labels = np.array(true_flags, dtype=np.int64)
    class_labels0 = np.array([train_label[a] for a in left_idx], dtype=np.int64)
    class_labels1 = np.array([train_label[b] for b in right_idx], dtype=np.int64)
    return view0, view1, labels, real_labels, class_labels0, class_labels1
def get_sn(view_num, alldata_len, missing_rate):
    """Randomly generate incomplete data information, simulate partial view data with complete view data.

    The result is a 0/1 matrix with one row per sample and one column per view;
    a 1 marks a view that is kept. Every row keeps at least one view.

    :param view_num: view number
    :param alldata_len: number of samples
    :param missing_rate: missing rate (the source cites section 4.3 of the paper);
        it is halved before use
    :return: Sn, array of shape ``(alldata_len, view_num)``
    """
    missing_rate = missing_rate / 2
    one_rate = 1.0 - missing_rate

    # At or below one kept view per sample on average: keep exactly one view each.
    if one_rate <= (1 / view_num):
        return _random_one_hot(view_num, alldata_len)

    # Nothing missing: every view of every sample is kept.
    if one_rate == 1:
        return np.ones((alldata_len, view_num), dtype=int)

    # No samples: the rejection loop below would divide by zero.
    if alldata_len == 0:
        return np.zeros((0, view_num), dtype=int)

    total = view_num * alldata_len
    # Ones still to place after the one guaranteed view per sample.
    one_num = total * one_rate - alldata_len

    # Rejection sampling: redraw until the fraction of ones is within 0.005 of
    # one_rate. NOTE(review): for very small alldata_len the tolerance may be
    # hard to hit and the loop can run for a long time.
    error = 1
    while error >= 0.005:
        view_preserve = _random_one_hot(view_num, alldata_len)
        ratio = one_num / total
        matrix_iter = (randint(0, 100, size=(alldata_len, view_num)) < int(ratio * 100)).astype(int)
        # a = random ones that landed on an already preserved view (wasted).
        a = np.sum(((matrix_iter + view_preserve) > 1).astype(int))
        # Inflate the target to compensate for the expected overlap.
        one_num_iter = one_num / (1 - a / one_num)
        ratio = one_num_iter / total
        matrix_iter = (randint(0, 100, size=(alldata_len, view_num)) < int(ratio * 100)).astype(int)
        matrix = ((matrix_iter + view_preserve) > 0).astype(int)
        ratio = np.sum(matrix) / total
        error = abs(one_rate - ratio)
    return matrix


def _random_one_hot(view_num, alldata_len):
    """Return an (alldata_len, view_num) float matrix with one random 1 per row."""
    picks = randint(0, view_num, size=(alldata_len, 1))
    # Indexing the identity always yields view_num columns, even when some
    # view index was never drawn (a fitted one-hot encoder would drop it).
    return np.eye(view_num)[picks[:, 0]]
class getDataset(Dataset):
    """Dataset over constructed training pairs.

    ``data`` holds two view matrices that are indexed column-wise (one sample
    per column); ``labels`` are the pair labels used for training and
    ``real_labels`` the reference pair labels (may be empty).
    """

    def __init__(self, data, labels, real_labels):
        self.data = data
        self.labels = labels
        self.real_labels = real_labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        # Column `index` of each view, as a float tensor with a leading axis of size 1.
        column0 = self.data[0][:, index]
        column1 = self.data[1][:, index]
        fea0 = torch.from_numpy(column0).float().unsqueeze(0)
        fea1 = torch.from_numpy(column1).float().unsqueeze(0)
        label = np.int64(self.labels[index])

        # With reference labels available, return them as a fourth item.
        if len(self.real_labels) != 0:
            return fea0, fea1, label, np.int64(self.real_labels[index])
        return fea0, fea1, label
class getAllDataset(Dataset):
    """Dataset over the full two-view data used for evaluation.

    ``data`` holds two view matrices indexed column-wise (one sample per
    column). Each item also carries the sample label, the per-view class
    labels and the corresponding row of ``mask``.
    """

    def __init__(self, data, labels, class_labels0, class_labels1, mask):
        self.data = data
        self.labels = labels
        self.class_labels0 = class_labels0
        self.class_labels1 = class_labels1
        self.mask = mask

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        # Column `index` of each view, as a float tensor with a leading axis of size 1.
        column0 = self.data[0][:, index]
        column1 = self.data[1][:, index]
        fea0 = torch.from_numpy(column0).float().unsqueeze(0)
        fea1 = torch.from_numpy(column1).float().unsqueeze(0)

        # Everything else is cast to int64 on the way out.
        return (
            fea0,
            fea1,
            np.int64(self.labels[index]),
            np.int64(self.class_labels0[index]),
            np.int64(self.class_labels1[index]),
            np.int64(self.mask[index]),
        )
def loader(train_bs, neg_prop, aligned_prop, complete_prop, is_noise, dataset):
    """Build the training-pair loader and the full-data loader for one dataset.

    :param train_bs: batch size for the training-pair loader
    :param neg_prop: negative / positive pairs' ratio
    :param aligned_prop: known aligned proportion used for training
    :param complete_prop: known complete proportion used for training
    :param is_noise: training with noisy labels or not, 0 --- not, 1 --- yes
    :param dataset: choice of dataset
    :return: ``(train_pair_loader, all_loader, divide_seed)`` -- the loader over the
        constructed pos. and neg. training pairs (shuffled, last partial batch
        dropped), the loader over the originally aligned and unaligned data
        (shuffled, batch size 1024), and the split seed reported by ``load_data``
    """
    loaded = load_data(dataset, neg_prop, aligned_prop, complete_prop, is_noise)
    (pair_views, pair_labels, pair_real_labels,
     full_views, full_label, full_label_X, full_label_Y,
     divide_seed, mask) = loaded

    pair_set = getDataset(pair_views, pair_labels, pair_real_labels)
    full_set = getAllDataset(full_views, full_label, full_label_X, full_label_Y, mask)

    train_pair_loader = DataLoader(pair_set, batch_size=train_bs, shuffle=True, drop_last=True)
    all_loader = DataLoader(full_set, batch_size=1024, shuffle=True)
    return train_pair_loader, all_loader, divide_seed