Skip to content

Commit

Permalink
"update README, add more comment to code"
Browse files Browse the repository at this point in the history
  • Loading branch information
dzhwinter committed May 15, 2017
1 parent f3ebf87 commit 0bfa8ea
Show file tree
Hide file tree
Showing 8 changed files with 579 additions and 168 deletions.
347 changes: 346 additions & 1 deletion ltr/README.md

Large diffs are not rendered by default.

171 changes: 88 additions & 83 deletions ltr/lambdaRank.py
Original file line number Diff line number Diff line change
@@ -1,99 +1,104 @@
import os, sys
import gzip
import sqlite3
import paddle.v2 as paddle
import numpy as np
import functools

# lambdaRank is a listwise learning-to-rank algorithm
# lambdaRank is a listwise learning-to-rank model

def lambdaRank(feature_dim):
    """Build the lambdaRank network and its listwise cost layer.

    Args:
        feature_dim: Size of the dense vector fed to the "data" layer.

    Returns:
        A ``(cost, output)`` tuple: the ``lambda_cost`` layer and the
        single-unit linear layer whose value it is computed from.
    """
    relevance = paddle.layer.data(
        name="label", type=paddle.data_type.integer_value_sequence(1))
    features = paddle.layer.data(
        name="data", type=paddle.data_type.dense_vector(feature_dim))

    # Two 256-unit tanh layers, stacked on the raw features.
    first_hidden = paddle.layer.fc(
        name="/hidden_1",
        input=features,
        size=256,
        act=paddle.activation.Tanh(),
        param_attr=paddle.attr.Param(initial_std=0.01, name="hidden_w1"))
    second_hidden = paddle.layer.fc(
        name="/hidden_2",
        input=first_hidden,
        size=256,
        act=paddle.activation.Tanh(),
        param_attr=paddle.attr.Param(initial_std=0.01, name="hidden_w2"))

    # A single linear unit produces the value passed to the cost layer.
    score = paddle.layer.fc(
        name="/output",
        input=second_hidden,
        size=1,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(initial_std=0.01, name="output"))

    listwise_cost = paddle.layer.lambda_cost(
        input=score, score=relevance, NDCG_num=10)
    return listwise_cost, score

def lambdaRank(input_dim):
    """Build the lambdaRank network and its listwise cost layer.

    Args:
        input_dim: Size of each dense vector in the "data" sequence.

    Returns:
        A ``(cost, output)`` tuple: the ``lambda_cost`` layer and the
        single-unit linear layer whose value it is computed from.
    """
    relevance = paddle.layer.data(
        name="label", type=paddle.data_type.dense_vector_sequence(1))
    features = paddle.layer.data(
        name="data", type=paddle.data_type.dense_vector_sequence(input_dim))

    # One 10-unit tanh hidden layer.
    hidden = paddle.layer.fc(
        input=features,
        size=10,
        act=paddle.activation.Tanh(),
        param_attr=paddle.attr.Param(initial_std=0.01))

    # A single linear unit produces the value passed to the cost layer.
    score = paddle.layer.fc(
        input=hidden,
        size=1,
        act=paddle.activation.Linear(),
        param_attr=paddle.attr.Param(initial_std=0.01))

    listwise_cost = paddle.layer.lambda_cost(
        input=score, score=relevance, NDCG_num=6, max_sort_size=-1)
    return listwise_cost, score


def train_lambdaRank(num_passes):
# Trains the lambdaRank network on the mq2007 dataset (format="listwise")
# for `num_passes` passes. At the end of every pass the event handler
# evaluates on the test reader and writes the parameters to
# "lambdaRank_params_<pass_id>.tar.gz".
#
# NOTE(review): this span comes from a rendered diff. Lines from the
# pre-commit and post-commit versions appear to be interleaved (every step
# below shows up twice with different settings) and indentation is missing.
# Confirm against the real file before relying on the statement order.

# Readers, variant 1: shuffle buffer of 1000 samples, batches of 1000.
fill_default_train = functools.partial(paddle.dataset.mq2007.train, format="listwise")
fill_default_test = functools.partial(paddle.dataset.mq2007.test, format="listwise")
train_reader = paddle.batch(
paddle.reader.shuffle(fill_default_train, buf_size=1000), batch_size=1000)
test_reader = paddle.batch(
paddle.reader.shuffle(fill_default_test, buf_size=1000), batch_size=1000)
# listwise input sequence
# Readers, variant 2: shuffle buffer of 100 samples, batches of 32.
fill_default_train = functools.partial(
paddle.dataset.mq2007.train, format="listwise")
fill_default_test = functools.partial(
paddle.dataset.mq2007.test, format="listwise")
train_reader = paddle.batch(
paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32)
test_reader = paddle.batch(
paddle.reader.shuffle(fill_default_test, buf_size=100), batch_size=32)

# mq2007 input_dim = 46, dense format
input_dim = 46
cost, output = lambdaRank(input_dim)
parameters = paddle.parameters.create(cost)

# SGD trainer driven by the Adam update rule, learning rate 1e-4.
trainer = paddle.trainer.SGD(
cost=cost,
parameters=parameters,
update_equation=paddle.optimizer.Adam(learning_rate=1e-4))

# mq2007 feature_dim = 46, dense format
# fc hidden_dim = 128
# NOTE(review): the lambdaRank definitions visible in this file use size=256
# and size=10 for their hidden layers, not 128 - the line above looks stale.
feature_dim = 46
cost, output = lambdaRank(feature_dim)
parameters = paddle.parameters.create(cost)

# Same trainer construction as above, formatted differently.
trainer = paddle.trainer.SGD(
cost=cost,
parameters=parameters,
update_equation=paddle.optimizer.Adam(learning_rate=1e-4)
)
# Define end batch and end pass event handler
# Handler variant 1: prints the cost after every iteration. At the end of a
# pass it runs trainer.test on test_reader, prints the metrics and writes the
# parameters to a gzip-compressed tar named after the pass id.
# `feeding` is assigned after this definition; that works because the name is
# only looked up when the handler is called from trainer.train.
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
print "Pass %d Batch %d Cost %.9f" % (event.pass_id, event.batch_id,
event.cost)
if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
with gzip.open("lambdaRank_params_%d.tar.gz" % (event.pass_id),
"w") as f:
parameters.to_tar(f)

# Presumably maps each data-layer name to its index within a reader sample
# (label first, features second) - verify against the mq2007 reader.
feeding = {"label": 0, "data": 1}
trainer.train(
reader=train_reader,
event_handler=event_handler,
feeding=feeding,
num_passes=num_passes)

# Define end batch and end pass event handler
# Handler variant 2: prints the cost only when batch_id is a multiple of 100
# and writes a "." to stdout for every other iteration; end-of-pass handling
# matches variant 1.
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d Batch %d Cost %.9f" % (
event.pass_id, event.batch_id, event.cost)
else:
sys.stdout.write(".")
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_reader, feeding=feeding)
print "\nTest with Pass %d, %s" %(event.pass_id, result.metrics)
with gzip.open("lambdaRank_params_%d.tar.gz" %(event.pass_id), "w") as f:
parameters.to_tar(f)
feeding = {"label":0,
"data": 1}
trainer.train(reader=train_reader,
event_handler=event_handler,
feeding=feeding,
num_passes=num_passes)

def lambdaRank_infer(pass_id):
# Loads the parameters saved as "lambdaRank_params_<pass_id - 1>.tar.gz",
# collects up to 1000 samples from the mq2007 test set, runs paddle.infer on
# them and prints every returned score.
#
# NOTE(review): this span comes from a rendered diff. Lines from the
# pre-commit and post-commit versions appear to be interleaved (the body is
# effectively present twice) and indentation is missing. Confirm against the
# real file before relying on the statement order.
print "Begin to Infer..."
feature_dim = 46
# NOTE(review): "lambdaRnak" is not defined anywhere in this view; the network
# builder is spelled lambdaRank. Looks like a typo - confirm.
output = lambdaRnak(feature_dim)
# The handle returned by gzip.open is never closed explicitly here.
parameters = paddle.parameters.Parameters.from_tar(gzip.open("lambdaRank_params_%d.tar.gz" %(pass_id-1)))
infer_data = []
infer_data_num = 1000
# Uses the dataset's default format and unpacks three values per sample,
# keeping only `left`; the second loop further down requests
# format="listwise" and unpacks two values instead.
for label, left, right in paddle.dataset.mq2007.test():
infer_data.append(left)
if len(infer_data) == infer_data_num:
break
predicitons = paddle.infer(output_layer=output,
parameters=parameters,
input=infer_data)
for i, score in enumerate(predicitons):
print score
"""
lambdaRank model inference interface
parameters:
pass_id : inference model in pass_id
"""
print "Begin to Infer..."
input_dim = 46
# NOTE(review): lambdaRank returns a (cost, output) pair, but the whole return
# value is bound to `output` here and then passed as output_layer - confirm
# that this is intended.
output = lambdaRank(input_dim)
# Training names each file after event.pass_id; this loads the file for
# pass_id - 1 (presumably because pass ids start at 0 - verify). The handle
# returned by gzip.open is never closed explicitly.
parameters = paddle.parameters.Parameters.from_tar(
gzip.open("lambdaRank_params_%d.tar.gz" % (pass_id - 1)))

# infer_query_id is assigned but not read again in this function.
infer_query_id = None
infer_data = []
infer_data_num = 1000
fill_default_test = functools.partial(
paddle.dataset.mq2007.test, format="listwise")
# Collect at most infer_data_num query lists; the labels are discarded.
for label, querylist in fill_default_test():
infer_data.append(querylist)
if len(infer_data) == infer_data_num:
break
predicitons = paddle.infer(
output_layer=output, parameters=parameters, input=infer_data)
# The enumerate index `i` is unused; only the scores are printed.
for i, score in enumerate(predicitons):
print score


if __name__ == '__main__':
# Script entry point: initialise Paddle with use_gpu=False and
# trainer_count=4, train, then run inference with pass_id=2
# (lambdaRank_infer loads the parameter file for pass_id - 1).
# NOTE(review): this span comes from a rendered diff; the two groups of calls
# below appear to be its pre-commit (2 passes) and post-commit (100 passes)
# sides shown together - confirm which one the real file contains.
paddle.init(use_gpu=False, trainer_count=4)
train_lambdaRank(2)
lambdaRank_infer(pass_id=2)
paddle.init(use_gpu=False, trainer_count=4)
train_lambdaRank(100)
lambdaRank_infer(pass_id=2)
Binary file added ltr/lambdarank.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added ltr/learningToRank.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
30 changes: 30 additions & 0 deletions ltr/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import numpy as np
import unittest


def ndcg(score_list):
    """Return the normalized discounted cumulative gain of a ranked list.

    ``score_list`` holds relevance scores in ranked order, best-ranked item
    first. The score at zero-based position ``i`` is discounted by
    ``log(i + 2)``; the discounted sum (DCG) is divided by the same sum
    computed over the scores sorted in descending order (the ideal DCG).

    Args:
        score_list: Sequence of numeric relevance scores in ranked order.

    Returns:
        DCG divided by ideal DCG. ``0.0`` is returned when the ideal DCG is
        zero (empty list or all-zero scores), where the ratio is undefined.
    """

    def dcg(scores):
        # Natural log is sufficient: the log base cancels in the final ratio.
        return sum(
            float(score) / np.log(rank + 2)
            for rank, score in enumerate(scores))

    ideal_cost = dcg(sorted(score_list, reverse=True))
    if ideal_cost == 0:
        # Avoid 0 / 0 (ZeroDivisionError for [] and NaN for all-zero scores).
        return 0.0
    return dcg(score_list) / ideal_cost


class NdcgTest(unittest.TestCase):
    """Checks ``ndcg`` against a known reference value."""

    def __init__(self, methodName="test_ndcg"):
        # unittest instantiates every case as TestCase(methodName), so the
        # constructor must accept that argument and forward it. The default
        # keeps the zero-argument construction NdcgTest() working.
        super(NdcgTest, self).__init__(methodName)

    def test_ndcg(self):
        """ndcg of [3, 2, 3, 0, 1, 2] is 0.961 to three decimal places."""
        a = [3, 2, 3, 0, 1, 2]
        value = ndcg(a)
        self.assertAlmostEqual(0.961, value, places=3)

    # Alias for the original method name. Only names starting with "test"
    # are collected by unittest, so the check still runs exactly once.
    runcase = test_ndcg


if __name__ == '__main__':
# Run the unit tests in this module when it is executed as a script.
unittest.main()
Binary file added ltr/ranknet.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 0bfa8ea

Please sign in to comment.