-
Notifications
You must be signed in to change notification settings - Fork 2.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #31 from dzhwinter/model_ltr2
Add the example for pairwise and listwise LTR.
- Loading branch information
Showing
8 changed files
with
669 additions
and
1 deletion.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
import os, sys | ||
import gzip | ||
import paddle.v2 as paddle | ||
import numpy as np | ||
import functools | ||
|
||
#lambdaRank is listwise learning to rank model | ||
|
||
|
||
def lambdaRank(input_dim):
    """
    Build the LambdaRank network, a listwise learning-to-rank model.

    Both the data and the label inputs must be sequences
    (dense_vector_sequence): one dense vector per document of a query.
    https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf

    parameters:
        input_dim : dimension of one document's dense feature vector.
        dense_vector_sequence format:
        [[f, ...], [f, ...], ...], where f is a float or int number.

    Returns the (cost, output) layer pair.
    """
    label = paddle.layer.data("label",
                              paddle.data_type.dense_vector_sequence(1))
    data = paddle.layer.data("data",
                             paddle.data_type.dense_vector_sequence(input_dim))

    # two tanh hidden layers followed by a linear scalar scoring layer
    hidden_a = paddle.layer.fc(
        input=data,
        size=128,
        act=paddle.activation.Tanh(),
        param_attr=paddle.attr.Param(initial_std=0.01))
    hidden_b = paddle.layer.fc(
        input=hidden_a,
        size=10,
        act=paddle.activation.Tanh(),
        param_attr=paddle.attr.Param(initial_std=0.01))
    output = paddle.layer.fc(
        input=hidden_b,
        size=1,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(initial_std=0.01))

    # AUC evaluator attached for monitoring during training
    evaluator = paddle.evaluator.auc(input=output, label=label)
    # listwise LambdaRank cost, optimizing NDCG over the top 6 documents
    cost = paddle.layer.lambda_cost(
        input=output, score=label, NDCG_num=6, max_sort_size=-1)
    return cost, output
|
||
|
||
def train_lambdaRank(num_passes):
    """
    Train the LambdaRank model on the mq2007 dataset (listwise format)
    for `num_passes` passes, saving parameters to
    lambdaRank_params_<pass>.tar.gz after each pass.
    """
    # listwise input sequence: each sample is one query's document list
    fill_default_train = functools.partial(
        paddle.dataset.mq2007.train, format="listwise")
    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    train_reader = paddle.batch(
        paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32)
    test_reader = paddle.batch(fill_default_test, batch_size=32)

    # mq2007 input_dim = 46, dense format
    input_dim = 46
    cost, output = lambdaRank(input_dim)
    parameters = paddle.parameters.create(cost)

    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=paddle.optimizer.Adam(learning_rate=1e-4))

    # Define end batch and end pass event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            print "Pass %d Batch %d Cost %.9f" % (event.pass_id, event.batch_id,
                                                  event.cost)
        if isinstance(event, paddle.event.EndPass):
            # evaluate on the test set, then checkpoint parameters
            # (`feeding` is captured from the enclosing scope; it is
            # defined below, before trainer.train triggers any events)
            result = trainer.test(reader=test_reader, feeding=feeding)
            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
            with gzip.open("lambdaRank_params_%d.tar.gz" % (event.pass_id),
                           "w") as f:
                parameters.to_tar(f)

    # input data order: slot 0 is the label sequence, slot 1 the features
    feeding = {"label": 0, "data": 1}
    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)
|
||
|
||
def lambdaRank_infer(pass_id): | ||
""" | ||
lambdaRank model inference interface | ||
parameters: | ||
pass_id : inference model in pass_id | ||
""" | ||
print "Begin to Infer..." | ||
input_dim = 46 | ||
output = lambdaRank(input_dim) | ||
parameters = paddle.parameters.Parameters.from_tar( | ||
gzip.open("lambdaRank_params_%d.tar.gz" % (pass_id - 1))) | ||
|
||
infer_query_id = None | ||
infer_data = [] | ||
infer_data_num = 1 | ||
fill_default_test = functools.partial( | ||
paddle.dataset.mq2007.test, format="listwise") | ||
for label, querylist in fill_default_test(): | ||
infer_data.append(querylist) | ||
if len(infer_data) == infer_data_num: | ||
break | ||
|
||
# predict score of infer_data document. Re-sort the document base on predict score | ||
# in descending order. then we build the ranking documents | ||
predicitons = paddle.infer( | ||
output_layer=output, parameters=parameters, input=infer_data) | ||
for i, score in enumerate(predicitons): | ||
print i, score | ||
|
||
|
||
if __name__ == '__main__':
    # CPU-only training with 4 trainer threads
    paddle.init(use_gpu=False, trainer_count=4)
    # train for 2 passes, then run inference with the pass-0 checkpoint
    train_lambdaRank(2)
    lambdaRank_infer(pass_id=1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
import numpy as np | ||
import unittest | ||
|
||
|
||
def ndcg(score_list):
    """
    Measure the NDCG score of an ordered relevance list.
    https://en.wikipedia.org/wiki/Discounted_cumulative_gain

    parameter:
        score_list: relevance scores in predicted rank order,
                    e.g. np.array with shape (sample_num, 1) or a plain list.
        e.g. predict rank score list :
        >>> scores = [3, 2, 3, 0, 1, 2]
        >>> ndcg_score = ndcg(scores)

    Returns 0.0 for an empty list or when every score is zero (ideal DCG
    is 0), instead of raising ZeroDivisionError.

    NOTE: the natural-log discount is equivalent to the conventional
    log2 discount here, because the log base cancels in the
    DCG / ideal-DCG ratio.
    """

    def dcg(scores):
        # discounted cumulative gain: score_i / log(i + 2), i zero-based
        return sum(
            float(s) / np.log(i + 2) for i, s in enumerate(scores))

    ideal_cost = dcg(sorted(score_list, reverse=True))
    if ideal_cost == 0:
        # empty input or all-zero relevance: define NDCG as 0
        return 0.0
    return dcg(score_list) / ideal_cost
|
||
|
||
class NdcgTest(unittest.TestCase):
    """Unit test for the ndcg metric."""

    # BUGFIX: the original class overrode __init__(self) without calling
    # TestCase.__init__, which breaks instantiation by the unittest
    # loader; and its only method was named `runcase`, which unittest
    # discovery never runs (test methods must be named test_*).

    def runcase(self):
        # kept for backward compatibility with any caller invoking it directly
        self.test_ndcg()

    def test_ndcg(self):
        # the example list from the ndcg docstring has a known NDCG
        a = [3, 2, 3, 0, 1, 2]
        value = ndcg(a)
        self.assertAlmostEqual(0.961, value, places=3)
|
||
|
||
if __name__ == '__main__':
    # discover and run the test cases in this module
    unittest.main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
import os | ||
import sys | ||
import gzip | ||
import functools | ||
import paddle.v2 as paddle | ||
import numpy as np | ||
from metrics import ndcg | ||
|
||
# ranknet is the classic pairwise learning to rank algorithm | ||
# http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf | ||
|
||
|
||
def half_ranknet(name_prefix, input_dim):
    """
    Build one scoring tower of RankNet.

    Parameters with the same name are shared in the paddle framework, so
    the left and right towers built by this function share their weights
    (params "hidden_w1" and "output"). Shared parameters in detail:
    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md
    """
    # input: one document's dense feature vector
    doc_features = paddle.layer.data(name_prefix + "/data",
                                     paddle.data_type.dense_vector(input_dim))

    # single tanh hidden layer with named (shared) weights
    hidden = paddle.layer.fc(
        input=doc_features,
        size=10,
        act=paddle.activation.Tanh(),
        param_attr=paddle.attr.Param(initial_std=0.01, name="hidden_w1"))

    # linear output layer producing a scalar rank score
    score = paddle.layer.fc(
        input=hidden,
        size=1,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(initial_std=0.01, name="output"))
    return score
|
||
|
||
def ranknet(input_dim):
    """
    Assemble the full pairwise RankNet: two weight-sharing scoring
    towers compared by a rank_cost layer. Returns the cost layer.
    """
    # pairwise label for the (left, right) document pair
    label = paddle.layer.data("label", paddle.data_type.dense_vector(1))

    # the towers reuse parameters via half_ranknet's named params
    left_score = half_ranknet("left", input_dim)
    right_score = half_ranknet("right", input_dim)

    # AUC evaluator on the left tower's score, for monitoring
    evaluator = paddle.evaluator.auc(input=left_score, label=label)

    # pairwise ranking cost layer
    return paddle.layer.rank_cost(
        name="cost", left=left_score, right=right_score, label=label)
|
||
|
||
def train_ranknet(num_passes):
    """
    Train RankNet on the mq2007 dataset (default pairwise format) for
    `num_passes` passes, saving parameters to
    ranknet_params_<pass>.tar.gz after each pass.
    """
    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.mq2007.train, buf_size=100),
        batch_size=100)
    test_reader = paddle.batch(paddle.dataset.mq2007.test, batch_size=100)

    # mq2007 feature_dim = 46, dense format
    # (half_ranknet uses a hidden layer of size 10)
    feature_dim = 46
    cost = ranknet(feature_dim)
    parameters = paddle.parameters.create(cost)

    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=paddle.optimizer.Adam(learning_rate=2e-4))

    # Define the input data order
    feeding = {"label": 0, "left/data": 1, "right/data": 2}

    # Define end batch and end pass event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            # full log line every 100 batches, a progress dot otherwise
            if event.batch_id % 100 == 0:
                print "Pass %d Batch %d Cost %.9f" % (
                    event.pass_id, event.batch_id, event.cost)
            else:
                sys.stdout.write(".")
                sys.stdout.flush()
        if isinstance(event, paddle.event.EndPass):
            # evaluate on the test set, then checkpoint parameters
            result = trainer.test(reader=test_reader, feeding=feeding)
            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
            with gzip.open("ranknet_params_%d.tar.gz" % (event.pass_id),
                           "w") as f:
                parameters.to_tar(f)

    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)
|
||
|
||
def ranknet_infer(pass_id): | ||
""" | ||
load the trained model. And predict with plain txt input | ||
""" | ||
print "Begin to Infer..." | ||
feature_dim = 46 | ||
|
||
# we just need half_ranknet to predict a rank score, which can be used in sort documents | ||
output = half_ranknet("left", feature_dim) | ||
parameters = paddle.parameters.Parameters.from_tar( | ||
gzip.open("ranknet_params_%d.tar.gz" % (pass_id - 1))) | ||
|
||
# load data of same query and relevance documents, need ranknet to rank these candidates | ||
infer_query_id = [] | ||
infer_data = [] | ||
infer_doc_index = [] | ||
|
||
# convert to mq2007 built-in data format | ||
# <query_id> <relevance_score> <feature_vector> | ||
plain_txt_test = functools.partial( | ||
paddle.dataset.mq2007.test, format="plain_txt") | ||
|
||
for query_id, relevance_score, feature_vector in plain_txt_test(): | ||
infer_query_id.append(query_id) | ||
infer_data.append(feature_vector) | ||
|
||
# predict score of infer_data document. Re-sort the document base on predict score | ||
# in descending order. then we build the ranking documents | ||
scores = paddle.infer( | ||
output_layer=output, parameters=parameters, input=infer_data) | ||
for query_id, score in zip(infer_query_id, scores): | ||
print "query_id : ", query_id, " ranknet rank document order : ", score | ||
|
||
|
||
if __name__ == '__main__':
    # CPU-only training with 4 trainer threads
    paddle.init(use_gpu=False, trainer_count=4)
    pass_num = 2
    train_ranknet(pass_num)
    # infer with the checkpoint written at the end of pass 0
    ranknet_infer(pass_id=pass_num - 1)