-
Notifications
You must be signed in to change notification settings - Fork 2.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ltr case done. #31
ltr case done. #31
Changes from 5 commits
f3ebf87
0bfa8ea
97b0adf
b5bc989
c80ee11
2113a40
94a13bc
f24bb3f
ab766df
be548d9
561b6c8
a87a3c9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
import os, sys | ||
import gzip | ||
import sqlite3 | ||
import paddle.v2 as paddle | ||
import numpy as np | ||
import functools | ||
|
||
#lambdaRank is listwise learning to rank model | ||
|
||
|
||
def lambdaRank(input_dim):
    """Assemble the LambdaRank (listwise learning-to-rank) network.

    Declares two sequence data layers: "label" (1-d dense vectors) and
    "data" (``input_dim``-d dense vectors). The features pass through a
    10-unit tanh layer and a 1-unit linear layer; the resulting scores
    and the labels are fed to ``lambda_cost``.

    parameters:
      input_dim : dimension of one document feature vector

    Returns a ``(cost, output)`` tuple: the lambda cost layer and the
    scoring layer.
    """
    relevance = paddle.layer.data(
        "label", paddle.data_type.dense_vector_sequence(1))
    features = paddle.layer.data(
        "data", paddle.data_type.dense_vector_sequence(input_dim))

    # feature sequence -> hidden representation
    hidden = paddle.layer.fc(
        input=features,
        size=10,
        act=paddle.activation.Tanh(),
        param_attr=paddle.attr.Param(initial_std=0.01))

    # hidden representation -> one ranking score per document
    score = paddle.layer.fc(
        input=hidden,
        size=1,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(initial_std=0.01))

    listwise_cost = paddle.layer.lambda_cost(
        input=score, score=relevance, NDCG_num=6, max_sort_size=-1)
    return listwise_cost, score
|
||
|
||
def train_lambdaRank(num_passes):
    """Train the LambdaRank network on MQ2007 in listwise format.

    After every batch the cost is printed; after every pass the model is
    evaluated on the test reader and the parameters are written to
    ``lambdaRank_params_<pass_id>.tar.gz``.

    parameters:
      num_passes : number of passes over the training reader
    """
    # listwise input sequence
    fill_default_train = functools.partial(
        paddle.dataset.mq2007.train, format="listwise")
    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")

    train_reader = paddle.batch(
        paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32)
    # Only training data is shuffled; the test set is read in its
    # original order because evaluation gains nothing from shuffling.
    test_reader = paddle.batch(fill_default_test, batch_size=32)

    # mq2007 input_dim = 46, dense format
    input_dim = 46
    cost, output = lambdaRank(input_dim)
    parameters = paddle.parameters.create(cost)

    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=paddle.optimizer.Adam(learning_rate=1e-4))

    # Position of each data layer inside a reader sample. Defined before
    # the handler so the closure never refers to a not-yet-bound name.
    feeding = {"label": 0, "data": 1}

    # Define end batch and end pass event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            print("Pass %d Batch %d Cost %.9f" % (event.pass_id, event.batch_id,
                                                  event.cost))
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_reader, feeding=feeding)
            print("\nTest with Pass %d, %s" % (event.pass_id, result.metrics))
            with gzip.open("lambdaRank_params_%d.tar.gz" % (event.pass_id),
                           "w") as f:
                parameters.to_tar(f)

    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)
|
||
|
||
def lambdaRank_infer(pass_id):
    """
    lambdaRank model inference interface

    Loads ``lambdaRank_params_<pass_id - 1>.tar.gz``, scores up to 1000
    query lists from the MQ2007 listwise test set and prints each
    prediction.

    parameters:
      pass_id : the parameter file of pass ``pass_id - 1`` is loaded
    """
    print("Begin to Infer...")
    input_dim = 46
    # lambdaRank() returns (cost, output); inference needs only the
    # scoring layer, not the whole tuple.
    _, output = lambdaRank(input_dim)

    # Close the archive as soon as the parameters have been loaded.
    with gzip.open("lambdaRank_params_%d.tar.gz" % (pass_id - 1)) as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    infer_data = []
    infer_data_num = 1000
    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    # NOTE(review): each query list is appended as-is; confirm whether
    # paddle.infer expects every sample wrapped as a one-field tuple.
    for label, querylist in fill_default_test():
        infer_data.append(querylist)
        if len(infer_data) == infer_data_num:
            break

    predictions = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for score in predictions:
        print(score)
|
||
|
||
if __name__ == "__main__":
    # Script entry point: initialise paddle (use_gpu=False,
    # trainer_count=4), train for 100 passes, then run inference with
    # pass_id=2.
    paddle.init(trainer_count=4, use_gpu=False)
    train_lambdaRank(num_passes=100)
    lambdaRank_infer(pass_id=2)
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 觉得train和infer分开好些,python lambdaRank.py --train/infer 这样子? book里面放在一起我觉得是因为使用jupyter-notebook都是一个文件。 models里觉得可以分为两个步骤的~ There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 我觉得放在一起好一些。用户不需要再去看新的参数,直接运行,log会显示infer的过程。 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 而且python 这种script类型的文件,可以随时更改 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ok~ There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 已经在文档中分节注释 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import numpy as np | ||
import unittest | ||
|
||
|
||
def ndcg(score_list):
    """Return the normalized discounted cumulative gain (NDCG) of a ranking.

    parameters:
      score_list : relevance scores listed in ranked order, best-ranked
                   document first

    The DCG of the list as given is divided by the DCG of the same scores
    sorted in descending order (the ideal ranking). When the ideal DCG is
    zero (empty list or all-zero scores) the ratio is undefined and 0.0
    is returned instead of raising or yielding NaN.
    """

    def dcg(scores):
        # The 0-based position i is discounted by log(i + 2). The natural
        # log is used; the base cancels out in the DCG / ideal-DCG ratio.
        return sum(
            float(score) / np.log(position + 2)
            for position, score in enumerate(scores))

    ideal_cost = dcg(sorted(score_list, reverse=True))
    if ideal_cost == 0:
        return 0.0
    return dcg(score_list) / ideal_cost
|
||
|
||
class NdcgTest(unittest.TestCase): | ||
def __init__(self): | ||
pass | ||
|
||
def runcase(self): | ||
a = [3, 2, 3, 0, 1, 2] | ||
value = ndcg(a) | ||
self.assertAlmostEqual(0.961, value, places=3) | ||
|
||
|
||
if __name__ == '__main__': | ||
unittest.main() |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
import os, sys | ||
import gzip | ||
import functools | ||
import paddle.v2 as paddle | ||
import numpy as np | ||
from metrics import ndcg | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 没看哪里用了ndcg~ There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ndcg在training过程中,作为函数传不进去 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这个是排序的基准函数,python里不能传递到training过程中 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 也就是说没用到? 文档中说明下metrics.py函数用途吧。 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. fix done. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. thanks for the recommendation! |
||
|
||
# ranknet is the classic pairwise learning to rank algorithm | ||
# http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf | ||
|
||
|
||
def half_ranknet(name_prefix, input_dim):
    """
    Build one branch (left or right) of the RankNet network.

    Parameters that carry the same name are shared by the paddle
    framework, so the two branches built by this function use the same
    "hidden_w1" and "output" weights. Details on shared parameters:
    https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md

    name_prefix : prefix of the data layer name ("<name_prefix>/data")
    input_dim   : dimension of the dense input feature vector

    Returns the 1-unit linear output layer that scores the document.
    """
    # data layer
    features = paddle.layer.data(
        name_prefix + "/data", paddle.data_type.dense_vector(input_dim))

    # hidden layer, weights shared through the "hidden_w1" name
    shared_hidden_attr = paddle.attr.Param(initial_std=0.01, name="hidden_w1")
    hidden = paddle.layer.fc(
        input=features,
        size=10,
        act=paddle.activation.Tanh(),
        param_attr=shared_hidden_attr)

    # fully connected output layer, weights shared through the "output" name
    shared_output_attr = paddle.attr.Param(initial_std=0.01, name="output")
    return paddle.layer.fc(
        input=hidden,
        size=1,
        act=paddle.activation.Linear(),
        param_attr=shared_output_attr)
|
||
|
||
def ranknet(input_dim):
    """Build the pairwise RankNet cost over a left and a right branch.

    input_dim : dimension of one document feature vector

    Returns the ``rank_cost`` layer named "cost".
    """
    # label layer
    pair_label = paddle.layer.data("label", paddle.data_type.integer_value(1))

    # both branches reuse the parameters named in half_ranknet
    left_score = half_ranknet("left", input_dim)
    right_score = half_ranknet("right", input_dim)

    # The evaluator object itself is not used afterwards; the call is
    # kept in place — presumably it registers the AUC metric with the
    # network (TODO confirm).
    auc_evaluator = paddle.evaluator.auc(input=left_score, label=pair_label)

    # rankcost layer
    return paddle.layer.rank_cost(
        name="cost", left=left_score, right=right_score, label=pair_label)
|
||
|
||
def train_ranknet(num_passes):
    """Train RankNet on the MQ2007 dataset.

    The cost is printed every 100 batches (a dot otherwise). After every
    pass the model is evaluated on the test reader and the parameters
    are written to ``ranknet_params_<pass_id>.tar.gz``.

    num_passes : number of passes over the training reader
    """
    shuffled_train = paddle.reader.shuffle(
        paddle.dataset.mq2007.train, buf_size=100)
    train_reader = paddle.batch(shuffled_train, batch_size=100)

    buffered_test = paddle.reader.buffered(
        paddle.dataset.mq2007.test, size=100)
    test_reader = paddle.batch(buffered_test, batch_size=100)

    # mq2007 feature_dim = 46, dense format
    feature_dim = 46
    cost = ranknet(feature_dim)
    parameters = paddle.parameters.create(cost)

    optimizer = paddle.optimizer.Adam(learning_rate=2e-4)
    trainer = paddle.trainer.SGD(
        cost=cost, parameters=parameters, update_equation=optimizer)

    # Define the input data order
    feeding = {"label": 0, "left/data": 1, "right/data": 2}

    def report_batch(event):
        # full progress line every 100th batch, a single dot otherwise
        if event.batch_id % 100 != 0:
            sys.stdout.write(".")
            sys.stdout.flush()
            return
        print("Pass %d Batch %d Cost %.9f" % (
            event.pass_id, event.batch_id, event.cost))

    def finish_pass(event):
        # evaluate on the test reader, then snapshot the parameters
        result = trainer.test(reader=test_reader, feeding=feeding)
        print("\nTest with Pass %d, %s" % (event.pass_id, result.metrics))
        snapshot_name = "ranknet_params_%d.tar.gz" % (event.pass_id)
        with gzip.open(snapshot_name, "w") as f:
            parameters.to_tar(f)

    # Define end batch and end pass event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            report_batch(event)
        if isinstance(event, paddle.event.EndPass):
            finish_pass(event)

    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)
|
||
|
||
def ranknet_infer(pass_id):
    """
    Load the trained model and predict ranking scores from plain txt input.

    Loads ``ranknet_params_<pass_id - 1>.tar.gz``, collects the feature
    vectors of the first query in the MQ2007 plain_txt test set, scores
    them with one RankNet branch and prints one line per document.

    pass_id : the parameter file of pass ``pass_id - 1`` is loaded
    """
    print("Begin to Infer...")
    feature_dim = 46

    # we just need half_ranknet to predict a rank score, which can be
    # used to sort documents
    output = half_ranknet("left", feature_dim)

    # Close the archive as soon as the parameters have been loaded.
    with gzip.open("ranknet_params_%d.tar.gz" % (pass_id - 1)) as f:
        parameters = paddle.parameters.Parameters.from_tar(f)

    # load the documents of a single query; ranknet ranks these candidates
    infer_query_id = None
    infer_data = []

    # mq2007 built-in plain txt format:
    # <query_id> <relevance_score> <feature_vector>
    plain_txt_test = functools.partial(
        paddle.dataset.mq2007.test, format="plain_txt")

    # NOTE(review): feature vectors are appended as-is; confirm whether
    # paddle.infer expects every sample wrapped as a one-field tuple.
    for query_id, relevance_score, feature_vector in plain_txt_test():
        if infer_query_id is None:
            infer_query_id = query_id
        elif infer_query_id != query_id:
            # stop at the first document that belongs to another query
            break
        infer_data.append(feature_vector)

    predictions = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)

    # Report the scores so the result of inference is visible; documents
    # are numbered in the order they were read.
    for i, score in enumerate(predictions):
        print("query_id : %s ranknet rank document %d 's score : %s" %
              (infer_query_id, i, score))
|
||
|
||
if __name__ == "__main__":
    # Script entry point: initialise paddle (use_gpu=False,
    # trainer_count=4), train for ten passes, then run inference with
    # pass_id set to one less than the number of passes.
    paddle.init(trainer_count=4, use_gpu=False)
    total_passes = 10
    train_ranknet(total_passes)
    ranknet_infer(pass_id=total_passes - 1)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sqlite3没用的话去掉
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fix
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
fix done