ltr case done. #31
@@ -0,0 +1,124 @@
import os, sys
import gzip
import paddle.v2 as paddle
import numpy as np
import functools

# LambdaRank is a listwise learning-to-rank model


def lambdaRank(input_dim):
    """
    LambdaRank is a listwise rank model; its input data and label must be sequences.
    https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf
    parameters:
        input_dim: the dimension of one document's dense feature vector

    dense_vector_sequence format:
    [[f, ...], [f, ...], ...], where each f is a float or int number
    """
    label = paddle.layer.data("label",
                              paddle.data_type.dense_vector_sequence(1))
    data = paddle.layer.data("data",
                             paddle.data_type.dense_vector_sequence(input_dim))

    # hidden layer
    hd1 = paddle.layer.fc(
        input=data,
        size=128,
        act=paddle.activation.Tanh(),
        param_attr=paddle.attr.Param(initial_std=0.01))

    hd2 = paddle.layer.fc(
        input=hd1,
        size=10,
        act=paddle.activation.Tanh(),
        param_attr=paddle.attr.Param(initial_std=0.01))
    output = paddle.layer.fc(
        input=hd2,
        size=1,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(initial_std=0.01))

    # evaluator
    evaluator = paddle.evaluator.auc(input=output, label=label)
    # cost layer
    cost = paddle.layer.lambda_cost(
        input=output, score=label, NDCG_num=6, max_sort_size=-1)
    return cost, output


def train_lambdaRank(num_passes):
    # listwise input sequence
    fill_default_train = functools.partial(
        paddle.dataset.mq2007.train, format="listwise")
    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    train_reader = paddle.batch(
        paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32)
    test_reader = paddle.batch(fill_default_test, batch_size=32)

    # mq2007 input_dim = 46, dense format
    input_dim = 46
    cost, output = lambdaRank(input_dim)
    parameters = paddle.parameters.create(cost)

    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=paddle.optimizer.Adam(learning_rate=1e-4))

    # Define end batch and end pass event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            print "Pass %d Batch %d Cost %.9f" % (event.pass_id, event.batch_id,
                                                  event.cost)
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_reader, feeding=feeding)
            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
            with gzip.open("lambdaRank_params_%d.tar.gz" % (event.pass_id),
                           "w") as f:
                parameters.to_tar(f)

    feeding = {"label": 0, "data": 1}
    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)


def lambdaRank_infer(pass_id):
    """
    lambdaRank model inference interface
    parameters:
        pass_id: run inference with the parameters saved at pass `pass_id - 1`
    """
    print "Begin to Infer..."
    input_dim = 46
    # lambdaRank returns (cost, output); only the output layer is used for inference
    cost, output = lambdaRank(input_dim)
    parameters = paddle.parameters.Parameters.from_tar(
        gzip.open("lambdaRank_params_%d.tar.gz" % (pass_id - 1)))

    infer_query_id = None
    infer_data = []
    infer_data_num = 1
    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    for label, querylist in fill_default_test():
        infer_data.append(querylist)
        if len(infer_data) == infer_data_num:
            break

    # predict a score for each document in infer_data, then re-sort the documents
    # by predicted score in descending order to build the ranking
    predictions = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for i, score in enumerate(predictions):
        print i, score


if __name__ == '__main__':
    paddle.init(use_gpu=False, trainer_count=4)
    train_lambdaRank(2)
    lambdaRank_infer(pass_id=1)
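For readers unfamiliar with the `dense_vector_sequence` format mentioned in the `lambdaRank` docstring, here is a rough sketch of what one listwise sample looks like. The values and the exact nesting are illustrative assumptions, not output copied from `paddle.dataset.mq2007`; they only mirror the `feeding = {"label": 0, "data": 1}` order used in `train_lambdaRank` above.

# A hypothetical listwise sample for one query with three candidate documents,
# assuming the feeding order {"label": 0, "data": 1} from train_lambdaRank.
# "label" is a sequence of 1-dim relevance scores, one per document;
# "data" is a sequence of 46-dim dense feature vectors, one per document.
one_query_sample = [
    [[2.0], [0.0], [1.0]],                 # label: relevance of each document
    [[0.1] * 46, [0.3] * 46, [0.2] * 46],  # data: feature vector of each document
]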
@@ -0,0 +1,42 @@
import numpy as np
import unittest


def ndcg(score_list):
[Review] Add some comments here~
[Author] ok
""" | ||
measure the ndcg score of order list | ||
https://en.wikipedia.org/wiki/Discounted_cumulative_gain | ||
parameter: | ||
score_list: np.array, shape=(sample_num,1) | ||
|
||
e.g. predict rank score list : | ||
>>> scores = [3, 2, 3, 0, 1, 2] | ||
>>> ndcg_score = ndcg(scores) | ||
|
||
""" | ||
|
||
def dcg(score_list): | ||
n = len(score_list) | ||
cost = .0 | ||
for i in range(n): | ||
cost += float(score_list[i]) / np.log((i + 1) + 1) | ||
return cost | ||
|
||
dcg_cost = dcg(score_list) | ||
score_ranking = sorted(score_list, reverse=True) | ||
ideal_cost = dcg(score_ranking) | ||
return dcg_cost / ideal_cost | ||


class NdcgTest(unittest.TestCase):
    def test_ndcg(self):
        a = [3, 2, 3, 0, 1, 2]
        value = ndcg(a)
        self.assertAlmostEqual(0.961, value, places=3)


if __name__ == '__main__':
    unittest.main()
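As the review thread on the `ndcg` import below points out, `ndcg` is not wired into the Paddle training loop; metrics.py is an offline evaluation helper. A minimal sketch of how it could be applied to a ranked list (the labels below are illustrative, not taken from the PR):

from metrics import ndcg

# Hypothetical relevance labels of documents, listed in the order the model ranked them.
# ndcg compares the DCG of this order against the DCG of the ideal descending order.
predicted_order_labels = [3, 2, 3, 0, 1, 2]
print "NDCG of predicted order: %.3f" % ndcg(predicted_order_labels)  # ~0.961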
@@ -0,0 +1,135 @@
import os
import sys
import gzip
import functools
import paddle.v2 as paddle
import numpy as np
from metrics import ndcg
[Review] I don't see where ndcg is used~
[Author] ndcg cannot be passed in as a function during training.
[Author] It is the reference metric for ranking; in Python it cannot be passed into the training process.
[Review] So it is not used? Please explain the purpose of the metrics.py functions in the documentation.
[Author] fix done.
[Author] thanks for the recommendation!

# RankNet is the classic pairwise learning-to-rank algorithm
# http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf


def half_ranknet(name_prefix, input_dim):
    """
    Parameters with the same name are shared in the Paddle framework, so the
    left and right half networks of RankNet share their weights.
    Parameter sharing is described in detail in
    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md
    """
    # data layer
    data = paddle.layer.data(name_prefix + "/data",
                             paddle.data_type.dense_vector(input_dim))

    # hidden layer
    hd1 = paddle.layer.fc(
        input=data,
        size=10,
        act=paddle.activation.Tanh(),
        param_attr=paddle.attr.Param(initial_std=0.01, name="hidden_w1"))
    # fully connected layer / output layer
    output = paddle.layer.fc(
        input=hd1,
        size=1,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(initial_std=0.01, name="output"))
    return output


def ranknet(input_dim):
    # label layer
    label = paddle.layer.data("label", paddle.data_type.dense_vector(1))

    # reuse the parameters of half_ranknet
    output_left = half_ranknet("left", input_dim)
    output_right = half_ranknet("right", input_dim)

    evaluator = paddle.evaluator.auc(input=output_left, label=label)
    # rankcost layer
    cost = paddle.layer.rank_cost(
        name="cost", left=output_left, right=output_right, label=label)
    return cost


def train_ranknet(num_passes):
    train_reader = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.mq2007.train, buf_size=100),
        batch_size=100)
    test_reader = paddle.batch(paddle.dataset.mq2007.test, batch_size=100)

    # mq2007 feature_dim = 46, dense format
    # fc hidden_dim = 10
    feature_dim = 46
    cost = ranknet(feature_dim)
    parameters = paddle.parameters.create(cost)

    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=paddle.optimizer.Adam(learning_rate=2e-4))

    # Define the input data order
    feeding = {"label": 0, "left/data": 1, "right/data": 2}

    # Define end batch and end pass event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                print "Pass %d Batch %d Cost %.9f" % (
                    event.pass_id, event.batch_id, event.cost)
            else:
                sys.stdout.write(".")
                sys.stdout.flush()
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_reader, feeding=feeding)
            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
            with gzip.open("ranknet_params_%d.tar.gz" % (event.pass_id),
                           "w") as f:
                parameters.to_tar(f)

    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)


def ranknet_infer(pass_id):
    """
    Load the trained model and predict with plain txt input.
    """
    print "Begin to Infer..."
    feature_dim = 46

    # only half_ranknet is needed to predict a rank score,
    # which can be used to sort documents
    output = half_ranknet("left", feature_dim)
    parameters = paddle.parameters.Parameters.from_tar(
        gzip.open("ranknet_params_%d.tar.gz" % (pass_id - 1)))

    # load the documents of each query and let ranknet rank these candidates
    infer_query_id = []
    infer_data = []
    infer_doc_index = []

    # convert to the mq2007 built-in data format
    # <query_id> <relevance_score> <feature_vector>
    plain_txt_test = functools.partial(
        paddle.dataset.mq2007.test, format="plain_txt")

    for query_id, relevance_score, feature_vector in plain_txt_test():
        infer_query_id.append(query_id)
        infer_data.append(feature_vector)

    # predict a score for each document in infer_data, then re-sort the documents
    # by predicted score in descending order to build the ranking
    scores = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for query_id, score in zip(infer_query_id, scores):
        print "query_id : ", query_id, " ranknet score : ", score


if __name__ == '__main__':
    paddle.init(use_gpu=False, trainer_count=4)
    pass_num = 2
    train_ranknet(pass_num)
    ranknet_infer(pass_id=pass_num - 1)
[Review] Align the comments~
[Author] The spaces are intentional; they indicate the parameters~
[Author] done
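`ranknet_infer` above only prints one score per document; building the actual ranking still requires grouping the scores by query and sorting them in descending order, as the in-code comment describes. A minimal sketch of that step, assuming `infer_query_id` and `scores` are the aligned lists produced in `ranknet_infer` (the helper name is illustrative, not part of the PR):

from collections import defaultdict


def rank_documents_by_score(infer_query_id, scores):
    # Group document indices by query id, then sort each group by predicted
    # score in descending order (a higher score means more relevant).
    per_query = defaultdict(list)
    for doc_index, (query_id, score) in enumerate(zip(infer_query_id, scores)):
        per_query[query_id].append((float(score), doc_index))
    return {
        query_id: [doc for _, doc in sorted(scored, reverse=True)]
        for query_id, scored in per_query.items()
    }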