"""Simulate click data based on human annotations.
See the following paper for more information on the simulation data.
* Qingyao Ai, Keping Bi, Cheng Luo, Jiafeng Guo, W. Bruce Croft. 2018. Unbiased Learning to Rank with Unbiased Propensity Estimation. In Proceedings of SIGIR '18
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import json
import math
import os
import random
import sys
import time

import numpy as np
import tensorflow as tf

from ultra.input_layer import BaseInputFeed
from ultra.utils import click_models as cm
import ultra.utils

# We disable pylint because we need python3 compatibility.
from six.moves import zip  # pylint: disable=redefined-builtin

class ClickSimulationFeed(BaseInputFeed):
"""Simulate clicks based on human annotations.
This class implements a input layer for unbiased learning to rank experiments
by simulating click data based on both the human relevance annotation of
each query-document pair and a predefined click model.
"""
def __init__(self, model, batch_size, hparam_str, session=None):
"""Create the model.
Args:
model: (BasicModel) The model we are going to train.
batch_size: the size of the batches generated in each iteration.
hparam_str: the hyper-parameters for the input layer.
"""
self.hparams = ultra.utils.hparams.HParams(
# the setting file for the predefined click models.
click_model_json='./example/ClickModel/pbm_0.1_1.0_4_1.0.json',
# Set True to feed relevance labels instead of simulated clicks.
oracle_mode=False,
# Set eta change step for dynamic bias severity in training, 0.0
# means no change.
dynamic_bias_eta_change=0.0,
# Set how many steps to change eta for dynamic bias severity in
# training, 0.0 means no change.
dynamic_bias_step_interval=1000,
)
        print('Create simulated clicks feed')
print(hparam_str)
self.hparams.parse(hparam_str)
self.click_model = None
if not self.hparams.oracle_mode:
with open(self.hparams.click_model_json) as fin:
model_desc = json.load(fin)
self.click_model = cm.loadModelFromJson(model_desc)
self.start_index = 0
self.count = 1
self.rank_list_size = model.rank_list_size
self.feature_size = model.feature_size
self.batch_size = batch_size
self.model = model
self.global_batch_count = 0
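
    # A minimal sketch of the click-model JSON consumed above, inferred from
    # the default file name `pbm_0.1_1.0_4_1.0.json` and from how this class
    # uses the loaded model (`eta`, `setExamProb`). The exact schema is owned
    # by `ultra.utils.click_models`; treat the key names here as assumptions,
    # not the authoritative format:
    #
    #   {
    #       "model_name": "position_biased_model",
    #       "eta": 0.1   # bias severity; updated dynamically in get_batch
    #   }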
    def prepare_sim_clicks_with_index(
            self, data_set, index, docid_inputs, letor_features, labels, check_validation=True):
        """Simulate clicks for the ranked list at `index` and append the
        resulting docid list, document features, and click labels to the
        given output lists in place."""
        i = index
# Generate clicks with click models.
label_list = [
0 if data_set.initial_list[i][x] < 0 else data_set.labels[i][x] for x in range(
self.rank_list_size)]
click_list = None
if self.hparams.oracle_mode:
click_list = label_list
else:
click_list, _, _ = self.click_model.sampleClicksForOneList(
list(label_list))
        # sample_count = 0
        # while check_validation and sum(click_list) == 0 and sample_count < self.MAX_SAMPLE_ROUND_NUM:
        #     click_list, _, _ = self.click_model.sampleClicksForOneList(list(label_list))
        #     sample_count += 1
        # Check if the data is valid: when check_validation is on, skip
        # lists that received no clicks at all.
        if check_validation and sum(click_list) == 0:
            return
base = len(letor_features)
for x in range(self.rank_list_size):
if data_set.initial_list[i][x] >= 0:
letor_features.append(
data_set.features[data_set.initial_list[i][x]])
        docid_inputs.append([-1 if data_set.initial_list[i][x] < 0 else base + x
                             for x in range(self.rank_list_size)])
labels.append(click_list)
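
    # A worked example of the method above, under assumed inputs (not taken
    # from any real dataset): with rank_list_size = 3,
    # initial_list[i] = [7, 9, -1] and labels[i] = [2, 0, 0], label_list
    # becomes [2, 0, 0]; a position-biased model might then sample
    # click_list = [1, 0, 0]. The features of documents 7 and 9 are appended
    # to letor_features, docid_inputs gains [base + 0, base + 1, -1], and the
    # trailing -1 marks the empty slot that the callers below re-point at the
    # shared padding feature.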
def get_batch(self, data_set, check_validation=False):
"""Get a random batch of data, prepare for step. Typically used for training.
To feed data in step(..) it must be a list of batch-major vectors, while
data here contains single length-major cases. So the main logic of this
function is to re-index data cases to be in the proper format for feeding.
Args:
data_set: (Raw_data) The dataset used to build the input layer.
check_validation: (bool) Set True to ignore data with no positive labels.
Returns:
input_feed: a feed dictionary for the next step
info_map: a dictionary contain some basic information about the batch (for debugging).
"""
        if len(data_set.initial_list[0]) < self.rank_list_size:
            raise ValueError("Input ranklist length must be no less than the required list size,"
                             " %d < %d." % (len(data_set.initial_list[0]), self.rank_list_size))
length = len(data_set.initial_list)
docid_inputs, letor_features, labels = [], [], []
rank_list_idxs = []
batch_num = len(docid_inputs)
        while len(docid_inputs) < self.batch_size:
            # Sample a ranked list uniformly at random.
            i = random.randrange(length)
self.prepare_sim_clicks_with_index(data_set, i,
docid_inputs, letor_features, labels, check_validation)
if batch_num < len(docid_inputs): # new list added
rank_list_idxs.append(i)
batch_num = len(docid_inputs)
local_batch_size = len(docid_inputs)
letor_features_length = len(letor_features)
        for i in range(local_batch_size):
            for j in range(self.rank_list_size):
                if docid_inputs[i][j] < 0:
                    # Point empty slots at letor_features_length, which the
                    # model is expected to map to a zero padding vector
                    # appended after the real features.
                    docid_inputs[i][j] = letor_features_length
batch_docid_inputs = []
batch_labels = []
        for length_idx in range(self.rank_list_size):
            # Batch docid inputs are just re-indexed docid_inputs.
            batch_docid_inputs.append(
                np.array([docid_inputs[batch_idx][length_idx]
                          for batch_idx in range(local_batch_size)], dtype=np.float32))
            # Batch labels are the simulated clicks, re-indexed the same way.
            batch_labels.append(
                np.array([labels[batch_idx][length_idx]
                          for batch_idx in range(local_batch_size)], dtype=np.float32))
# Create input feed map
input_feed = {}
input_feed[self.model.letor_features.name] = np.array(letor_features)
for l in range(self.rank_list_size):
input_feed[self.model.docid_inputs[l].name] = batch_docid_inputs[l]
input_feed[self.model.labels[l].name] = batch_labels[l]
# Create info_map to store other information
info_map = {
'rank_list_idxs': rank_list_idxs,
'input_list': docid_inputs,
'click_list': labels,
'letor_features': letor_features
}
self.global_batch_count += 1
if self.hparams.dynamic_bias_eta_change != 0 and not self.hparams.oracle_mode:
if self.global_batch_count % self.hparams.dynamic_bias_step_interval == 0:
self.click_model.eta += self.hparams.dynamic_bias_eta_change
self.click_model.setExamProb(self.click_model.eta)
print(
'Dynamically change bias severity eta to %.3f' %
self.click_model.eta)
return input_feed, info_map
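
    # A hedged usage sketch for get_batch (the feed construction and session
    # wiring are assumed, and `input_feed_layer`/`train_set` are illustrative
    # names, not defined in this file):
    #
    #   input_feed, info_map = input_feed_layer.get_batch(
    #       train_set, check_validation=True)
    #   # input_feed maps tensor names to numpy arrays, so it can be passed
    #   # directly as feed_dict to session.run(...).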
def get_next_batch(self, index, data_set, check_validation=False):
"""Get the next batch of data from a specific index, prepare for step.
Typically used for validation.
To feed data in step(..) it must be a list of batch-major vectors, while
data here contains single length-major cases. So the main logic of this
function is to re-index data cases to be in the proper format for feeding.
Args:
index: the index of the data before which we will use to create the data batch.
data_set: (Raw_data) The dataset used to build the input layer.
check_validation: (bool) Set True to ignore data with no positive labels.
Returns:
input_feed: a feed dictionary for the next step
info_map: a dictionary contain some basic information about the batch (for debugging).
"""
        if len(data_set.initial_list[0]) < self.rank_list_size:
            raise ValueError("Input ranklist length must be no less than the required list size,"
                             " %d < %d." % (len(data_set.initial_list[0]), self.rank_list_size))
docid_inputs, letor_features, labels = [], [], []
num_remain_data = len(data_set.initial_list) - index
for offset in range(min(self.batch_size, num_remain_data)):
i = index + offset
self.prepare_sim_clicks_with_index(
data_set, i, docid_inputs, letor_features, labels, check_validation)
local_batch_size = len(docid_inputs)
letor_features_length = len(letor_features)
for i in range(local_batch_size):
for j in range(self.rank_list_size):
if docid_inputs[i][j] < 0:
docid_inputs[i][j] = letor_features_length
batch_docid_inputs = []
batch_labels = []
for length_idx in range(self.rank_list_size):
            # Batch docid inputs are just re-indexed docid_inputs.
batch_docid_inputs.append(
np.array([docid_inputs[batch_idx][length_idx]
for batch_idx in range(local_batch_size)], dtype=np.float32))
            # Batch labels are the simulated clicks, re-indexed the same way.
batch_labels.append(
np.array([labels[batch_idx][length_idx]
for batch_idx in range(local_batch_size)], dtype=np.float32))
# Create input feed map
input_feed = {}
input_feed[self.model.letor_features.name] = np.array(letor_features)
for l in range(self.rank_list_size):
input_feed[self.model.docid_inputs[l].name] = batch_docid_inputs[l]
input_feed[self.model.labels[l].name] = batch_labels[l]
# Create others_map to store other information
others_map = {
'input_list': docid_inputs,
'click_list': labels,
}
return input_feed, others_map
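
    # A sketch of sequential evaluation with get_next_batch (variable names
    # are illustrative): step through the validation partition in batch_size
    # increments so that, unlike the random sampling in get_batch, every
    # ranked list is visited exactly once.
    #
    #   it = 0
    #   while it < len(valid_set.initial_list):
    #       input_feed, others_map = input_feed_layer.get_next_batch(
    #           it, valid_set, check_validation=False)
    #       it += input_feed_layer.batch_size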
def get_data_by_index(self, data_set, index, check_validation=False):
"""Get one data from the specified index, prepare for step.
Args:
data_set: (Raw_data) The dataset used to build the input layer.
index: the index of the data
check_validation: (bool) Set True to ignore data with no positive labels.
Returns:
The triple (docid_inputs, decoder_inputs, target_weights) for
the constructed batch that has the proper format to call step(...) later.
"""
        if len(data_set.initial_list[0]) < self.rank_list_size:
            raise ValueError("Input ranklist length must be no less than the required list size,"
                             " %d < %d." % (len(data_set.initial_list[0]), self.rank_list_size))
docid_inputs, letor_features, labels = [], [], []
i = index
self.prepare_sim_clicks_with_index(
data_set,
i,
docid_inputs,
letor_features,
labels,
check_validation)
        letor_features_length = len(letor_features)
        for j in range(self.rank_list_size):
            # Note: this assumes the data at `index` passed the validation
            # check, so that prepare_sim_clicks_with_index appended a list.
            if docid_inputs[-1][j] < 0:
                docid_inputs[-1][j] = letor_features_length
batch_docid_inputs = []
batch_labels = []
for length_idx in range(self.rank_list_size):
            # Batch docid inputs are just re-indexed docid_inputs.
batch_docid_inputs.append(
np.array([docid_inputs[batch_idx][length_idx]
for batch_idx in range(1)], dtype=np.float32))
            # Batch labels are the simulated clicks, re-indexed the same way.
batch_labels.append(
np.array([labels[batch_idx][length_idx]
for batch_idx in range(1)], dtype=np.float32))
# Create input feed map
input_feed = {}
input_feed[self.model.letor_features.name] = np.array(letor_features)
for l in range(self.rank_list_size):
input_feed[self.model.docid_inputs[l].name] = batch_docid_inputs[l]
input_feed[self.model.labels[l].name] = batch_labels[l]
# Create others_map to store other information
others_map = {
'input_list': docid_inputs,
'click_list': labels,
}
return input_feed, others_map