diff --git a/ltr/lambdaRank.py b/ltr/lambdaRank.py
index 1de249875a..7a213791f3 100644
--- a/ltr/lambdaRank.py
+++ b/ltr/lambdaRank.py
@@ -8,6 +8,15 @@
 
 
 def lambdaRank(input_dim):
+    """
+    LambdaRank is a listwise learning-to-rank model; its input data and labels must be sequences.
+    https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf
+    parameters:
+      input_dim, the dimension of one document's dense feature vector
+
+    dense_vector_sequence format:
+    [[f, ...], [f, ...], ...], where each f is a float or int
+    """
     label = paddle.layer.data("label",
                               paddle.data_type.dense_vector_sequence(1))
     data = paddle.layer.data("data",
@@ -16,14 +25,24 @@ def lambdaRank(input_dim):
     # hidden layer
     hd1 = paddle.layer.fc(
         input=data,
+        size=128,
+        act=paddle.activation.Tanh(),
+        param_attr=paddle.attr.Param(initial_std=0.01))
+
+    hd2 = paddle.layer.fc(
+        input=hd1,
         size=10,
         act=paddle.activation.Tanh(),
         param_attr=paddle.attr.Param(initial_std=0.01))
     output = paddle.layer.fc(
-        input=hd1,
+        input=hd2,
         size=1,
         act=paddle.activation.Linear(),
         param_attr=paddle.attr.Param(initial_std=0.01))
+
+    # evaluator
+    evaluator = paddle.evaluator.auc(input=output, label=label)
+    # cost layer
     cost = paddle.layer.lambda_cost(
         input=output, score=label, NDCG_num=6, max_sort_size=-1)
     return cost, output
@@ -39,7 +58,7 @@ def train_lambdaRank(num_passes):
         paddle.reader.shuffle(fill_default_train, buf_size=100),
         batch_size=32)
     test_reader = paddle.batch(fill_default_test, batch_size=32)
-    # mq2007 input_dim = 46, dense format 
+    # mq2007 input_dim = 46, dense format
     input_dim = 46
     cost, output = lambdaRank(input_dim)
     parameters = paddle.parameters.create(cost)
@@ -83,20 +102,23 @@ def lambdaRank_infer(pass_id):
     infer_query_id = None
     infer_data = []
-    infer_data_num = 1000
+    infer_data_num = 1
     fill_default_test = functools.partial(
         paddle.dataset.mq2007.test, format="listwise")
 
     for label, querylist in fill_default_test():
         infer_data.append(querylist)
         if len(infer_data) == infer_data_num:
             break
+
+    # predict the score of each document in infer_data, then re-sort the documents
+    # by predicted score in descending order to build the final ranking
     predicitons = paddle.infer(
         output_layer=output, parameters=parameters, input=infer_data)
     for i, score in enumerate(predicitons):
-        print score
+        print i, score
 
 
 if __name__ == '__main__':
     paddle.init(use_gpu=False, trainer_count=4)
-    train_lambdaRank(100)
-    lambdaRank_infer(pass_id=2)
+    train_lambdaRank(2)
+    lambdaRank_infer(pass_id=1)
diff --git a/ltr/ranknet.py b/ltr/ranknet.py
index ca3b6c7023..862d5dea75 100644
--- a/ltr/ranknet.py
+++ b/ltr/ranknet.py
@@ -1,4 +1,5 @@
-import os, sys
+import os
+import sys
 import gzip
 import functools
 import paddle.v2 as paddle
@@ -37,7 +38,7 @@ def half_ranknet(name_prefix, input_dim):
 
 def ranknet(input_dim):
     # label layer
-    label = paddle.layer.data("label", paddle.data_type.integer_value(1))
+    label = paddle.layer.data("label", paddle.data_type.dense_vector(1))
 
     # reuse the parameter in half_ranknet
     output_left = half_ranknet("left", input_dim)
@@ -56,7 +57,7 @@ def train_ranknet(num_passes):
         batch_size=100)
     test_reader = paddle.batch(paddle.dataset.mq2007.test, batch_size=100)
 
-    # mq2007 feature_dim = 46, dense format 
+    # mq2007 feature_dim = 46, dense format
     # fc hidden_dim = 128
     feature_dim = 46
     cost = ranknet(feature_dim)
@@ -106,10 +107,9 @@ def ranknet_infer(pass_id):
         gzip.open("ranknet_params_%d.tar.gz" % (pass_id - 1)))
 
     # load data of same query and relevance documents, need ranknet to rank these candidates
-    infer_query_id = None
+    infer_query_id = []
     infer_data = []
-    infer_score_list = []
-    infer_data_num = 1000
+    infer_doc_index = []
 
     # convert to mq2007 built-in data format
     #
@@ -117,17 +117,19 @@ def ranknet_infer(pass_id):
         paddle.dataset.mq2007.test, format="plain_txt")
 
     for query_id, relevance_score, feature_vector in plain_txt_test():
-        if infer_query_id == None:
-            infer_query_id = query_id
-        elif infer_query_id != query_id:
-            break
+        infer_query_id.append(query_id)
         infer_data.append(feature_vector)
-    predicitons = paddle.infer(
+
+    # predict the score of each document in infer_data, then re-sort the documents
+    # by predicted score in descending order to build the final ranking
+    scores = paddle.infer(
         output_layer=output, parameters=parameters, input=infer_data)
+    for query_id, score in zip(infer_query_id, scores):
+        print "query_id : ", query_id, " ranknet rank document order : ", score
 
 
 if __name__ == '__main__':
     paddle.init(use_gpu=False, trainer_count=4)
-    pass_num = 10
+    pass_num = 2
     train_ranknet(pass_num)
     ranknet_infer(pass_id=pass_num - 1)
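
The comments added to ranknet_infer describe re-sorting each query's candidate documents by predicted score in descending order, while the patched code only prints the raw scores. A minimal sketch of that re-sorting step, assuming the parallel infer_query_id / scores lists built in ranknet_infer (rank_documents is a hypothetical helper, not an API from this repository):

from collections import defaultdict


def rank_documents(infer_query_id, scores):
    # group (document index, predicted score) pairs by query id
    per_query = defaultdict(list)
    for doc_index, (query_id, score) in enumerate(zip(infer_query_id, scores)):
        per_query[query_id].append((doc_index, float(score)))

    # within each query, sort candidate documents by predicted score, descending
    ranking = {}
    for query_id, doc_scores in per_query.items():
        doc_scores.sort(key=lambda pair: pair[1], reverse=True)
        ranking[query_id] = [doc_index for doc_index, _ in doc_scores]
    return ranking


# toy example: the first two documents belong to query "q1", and the second one scores higher
print rank_documents(["q1", "q1", "q2"], [0.2, 0.9, 0.5])
# -> {'q1': [1, 0], 'q2': [2]}   (indices into the original infer_data list)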