From f6c3431b2b630098f64052e642d20de54b74f94d Mon Sep 17 00:00:00 2001 From: caoying03 Date: Thu, 23 Nov 2017 11:36:50 +0800 Subject: [PATCH] fix LTR example. --- ltr/lambda_rank.py | 114 ++++++++++++++++++++++----------- ltr/ranknet.py | 119 ++++++++++++++++++++++++----------- ltr/run_lambdarank.sh | 11 ---- ltr/run_ranknet.sh | 11 ---- text_classification/utils.py | 42 ++++++------- 5 files changed, 178 insertions(+), 119 deletions(-) delete mode 100644 ltr/run_lambdarank.sh delete mode 100644 ltr/run_ranknet.sh diff --git a/ltr/lambda_rank.py b/ltr/lambda_rank.py index 2314e3031ab..49d643c0623 100644 --- a/ltr/lambda_rank.py +++ b/ltr/lambda_rank.py @@ -3,10 +3,14 @@ import gzip import functools import argparse +import logging import numpy as np import paddle.v2 as paddle +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) + def lambda_rank(input_dim, is_infer): """ @@ -26,43 +30,39 @@ def lambda_rank(input_dim, is_infer): data = paddle.layer.data("data", paddle.data_type.dense_vector_sequence(input_dim)) - # Define hidden layer. - hd1 = paddle.layer.fc( - input=data, - size=128, - act=paddle.activation.Tanh(), - param_attr=paddle.attr.Param(initial_std=0.01)) - - hd2 = paddle.layer.fc( - input=hd1, - size=10, - act=paddle.activation.Tanh(), - param_attr=paddle.attr.Param(initial_std=0.01)) - output = paddle.layer.fc( - input=hd2, - size=1, - act=paddle.activation.Linear(), - param_attr=paddle.attr.Param(initial_std=0.01)) + # Define the hidden layer. + hd1 = paddle.layer.fc(input=data, + size=128, + act=paddle.activation.Tanh(), + param_attr=paddle.attr.Param(initial_std=0.01)) + + hd2 = paddle.layer.fc(input=hd1, + size=10, + act=paddle.activation.Tanh(), + param_attr=paddle.attr.Param(initial_std=0.01)) + output = paddle.layer.fc(input=hd2, + size=1, + act=paddle.activation.Linear(), + param_attr=paddle.attr.Param(initial_std=0.01)) if not is_infer: - # Define evaluator. - evaluator = paddle.evaluator.auc(input=output, label=label) - # Define cost layer. + # Define the cost layer. cost = paddle.layer.lambda_cost( input=output, score=label, NDCG_num=6, max_sort_size=-1) return cost, output return output -def train_lambda_rank(num_passes): - # The input for LambdaRank is a sequence. +def lambda_rank_train(num_passes, model_save_dir): + # The input for LambdaRank must be a sequence. fill_default_train = functools.partial( paddle.dataset.mq2007.train, format="listwise") fill_default_test = functools.partial( paddle.dataset.mq2007.test, format="listwise") train_reader = paddle.batch( - paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32) + paddle.reader.shuffle( + fill_default_train, buf_size=100), batch_size=32) test_reader = paddle.batch(fill_default_test, batch_size=32) # Training dataset: mq2007, input_dim = 46, dense format. @@ -78,13 +78,15 @@ def train_lambda_rank(num_passes): # Define end batch and end pass event handler. def event_handler(event): if isinstance(event, paddle.event.EndIteration): - print "Pass %d Batch %d Cost %.9f" % (event.pass_id, event.batch_id, - event.cost) + logger.info("Pass %d Batch %d Cost %.9f" % + (event.pass_id, event.batch_id, event.cost)) if isinstance(event, paddle.event.EndPass): result = trainer.test(reader=test_reader, feeding=feeding) - print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) - with gzip.open("lambda_rank_params_%d.tar.gz" % (event.pass_id), - "w") as f: + logger.info("\nTest with Pass %d, %s" % + (event.pass_id, result.metrics)) + with gzip.open( + os.path.join(model_save_dir, "lambda_rank_params_%d.tar.gz" + % (event.pass_id)), "w") as f: trainer.save_parameter_to_tar(f) feeding = {"label": 0, "data": 1} @@ -95,17 +97,17 @@ def event_handler(event): num_passes=num_passes) -def lambda_rank_infer(pass_id): +def lambda_rank_infer(test_model_path): """LambdaRank model inference interface. Parameters: - pass_id : inference model in pass_id + test_model_path : The path of the trained model. """ - print "Begin to Infer..." + logger.info("Begin to Infer...") input_dim = 46 output = lambda_rank(input_dim, is_infer=True) parameters = paddle.parameters.Parameters.from_tar( - gzip.open("lambda_rank_params_%d.tar.gz" % (pass_id - 1))) + gzip.open(test_model_path)) infer_query_id = None infer_data = [] @@ -128,15 +130,51 @@ def lambda_rank_infer(pass_id): if __name__ == '__main__': - parser = argparse.ArgumentParser(description='LambdaRank demo') - parser.add_argument("--run_type", type=str, help="run type is train|infer") + parser = argparse.ArgumentParser( + description="PaddlePaddle LambdaRank example.") + parser.add_argument( + "--run_type", + type=str, + help=("A flag indicating to run the training or the inferring task. " + "Available options are: train or infer."), + default="train") parser.add_argument( "--num_passes", type=int, - help="The Num of passes in train| infer pass number of model.") + help="The number of passes to train the model.", + default=10) + parser.add_argument( + "--use_gpu", + type=bool, + help="A flag indicating whether to use the GPU device in training.", + default=False) + parser.add_argument( + "--trainer_count", + type=int, + help="The thread number used in training.", + default=1) + parser.add_argument( + "--model_save_dir", + type=str, + required=False, + help=("The path to save the trained models."), + default="models") + parser.add_argument( + "--test_model_path", + type=str, + required=False, + help=("This parameter works only in inferring task to " + "specify path of a trained model."), + default="") + args = parser.parse_args() - paddle.init(use_gpu=False, trainer_count=1) + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) if args.run_type == "train": - train_lambda_rank(args.num_passes) + lambda_rank_train(args.num_passes, args.model_save_dir) elif args.run_type == "infer": - lambda_rank_infer(pass_id=args.num_passes - 1) + assert os.path.exists(args.test_model_path), ( + "The trained model does not exit. Please set a correct path.") + lambda_rank_infer(args.test_model_path) + else: + logger.fatal(("A wrong value for parameter run type. " + "Available options are: train or infer.")) diff --git a/ltr/ranknet.py b/ltr/ranknet.py index 339bb34cfbb..b0667034154 100644 --- a/ltr/ranknet.py +++ b/ltr/ranknet.py @@ -2,10 +2,15 @@ import sys import gzip import functools -import paddle.v2 as paddle +import argparse +import logging import numpy as np + +import paddle.v2 as paddle from metrics import ndcg -import argparse + +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) # ranknet is the classic pairwise learning to rank algorithm # http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf @@ -23,17 +28,18 @@ def half_ranknet(name_prefix, input_dim): paddle.data_type.dense_vector(input_dim)) # hidden layer - hd1 = paddle.layer.fc( - input=data, - size=10, - act=paddle.activation.Tanh(), - param_attr=paddle.attr.Param(initial_std=0.01, name="hidden_w1")) - # fully connect layer/ output layer - output = paddle.layer.fc( - input=hd1, - size=1, - act=paddle.activation.Linear(), - param_attr=paddle.attr.Param(initial_std=0.01, name="output")) + hd1 = paddle.layer.fc(input=data, + size=10, + act=paddle.activation.Tanh(), + param_attr=paddle.attr.Param( + initial_std=0.01, name="hidden_w1")) + + # fully connected layer and output layer + output = paddle.layer.fc(input=hd1, + size=1, + act=paddle.activation.Linear(), + param_attr=paddle.attr.Param( + initial_std=0.01, name="output")) return output @@ -45,16 +51,16 @@ def ranknet(input_dim): output_left = half_ranknet("left", input_dim) output_right = half_ranknet("right", input_dim) - evaluator = paddle.evaluator.auc(input=output_left, label=label) # rankcost layer cost = paddle.layer.rank_cost( name="cost", left=output_left, right=output_right, label=label) return cost -def train_ranknet(num_passes): +def ranknet_train(num_passes, model_save_dir): train_reader = paddle.batch( - paddle.reader.shuffle(paddle.dataset.mq2007.train, buf_size=100), + paddle.reader.shuffle( + paddle.dataset.mq2007.train, buf_size=100), batch_size=100) test_reader = paddle.batch(paddle.dataset.mq2007.test, batch_size=100) @@ -75,17 +81,16 @@ def train_ranknet(num_passes): # Define end batch and end pass event handler def event_handler(event): if isinstance(event, paddle.event.EndIteration): - if event.batch_id % 100 == 0: - print "Pass %d Batch %d Cost %.9f" % ( - event.pass_id, event.batch_id, event.cost) - else: - sys.stdout.write(".") - sys.stdout.flush() + if event.batch_id % 25 == 0: + logger.info("Pass %d Batch %d Cost %.9f" % + (event.pass_id, event.batch_id, event.cost)) if isinstance(event, paddle.event.EndPass): result = trainer.test(reader=test_reader, feeding=feeding) - print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) - with gzip.open("ranknet_params_%d.tar.gz" % (event.pass_id), - "w") as f: + logger.info("\nTest with Pass %d, %s" % + (event.pass_id, result.metrics)) + with gzip.open( + os.path.join(model_save_dir, "ranknet_params_%d.tar.gz" % + (event.pass_id)), "w") as f: trainer.save_parameter_to_tar(f) trainer.train( @@ -95,18 +100,17 @@ def event_handler(event): num_passes=num_passes) -def ranknet_infer(pass_id): +def ranknet_infer(model_path): """ - load the trained model. And predict with plain txt input - """ - print "Begin to Infer..." + load the trained model. And predict with plain txt input + """ + logger.info("Begin to Infer...") feature_dim = 46 # we just need half_ranknet to predict a rank score, # which can be used in sort documents output = half_ranknet("infer", feature_dim) - parameters = paddle.parameters.Parameters.from_tar( - gzip.open("ranknet_params_%d.tar.gz" % (pass_id))) + parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path)) # load data of same query and relevance documents, # need ranknet to rank these candidates @@ -133,16 +137,55 @@ def ranknet_infer(pass_id): print "query_id : ", query_id, " ranknet rank document order : ", score -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Ranknet demo') - parser.add_argument("--run_type", type=str, help="run type is train|infer") +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="PaddlePaddle RankNet example.") + parser.add_argument( + "--run_type", + type=str, + help=("A flag indicating to run the training or the inferring task. " + "Available options are: train or infer."), + default="train") parser.add_argument( "--num_passes", type=int, - help="num of passes in train| infer pass number of model") + help="The number of passes to train the model.", + default=10) + parser.add_argument( + "--use_gpu", + type=bool, + help="A flag indicating whether to use the GPU device in training.", + default=False) + parser.add_argument( + "--trainer_count", + type=int, + help="The thread number used in training.", + default=1) + parser.add_argument( + "--model_save_dir", + type=str, + required=False, + help=("The path to save the trained models."), + default="models") + parser.add_argument( + "--test_model_path", + type=str, + required=False, + help=("This parameter works only in inferring task to " + "specify path of a trained model."), + default="") + args = parser.parse_args() - paddle.init(use_gpu=False, trainer_count=4) + if not os.path.exists(args.model_save_dir): os.mkdir(args.model_save_dir) + + paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) + if args.run_type == "train": - train_ranknet(args.num_passes) + ranknet_train(args.num_passes, args.model_save_dir) elif args.run_type == "infer": - ranknet_infer(pass_id=args.pass_num - 1) + assert os.path.exists( + args.test_model_path), "The trained model does not exit." + ranknet_infer(args.test_model_path) + else: + logger.fatal(("A wrong value for parameter run type. " + "Available options are: train or infer.")) diff --git a/ltr/run_lambdarank.sh b/ltr/run_lambdarank.sh deleted file mode 100644 index 9546be2cb1f..00000000000 --- a/ltr/run_lambdarank.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/sh - -python lambda_rank.py \ - --run_type="train" \ - --num_passes=10 \ - 2>&1 | tee lambdarank_train.log - -python lambda_rank.py \ - --run_type="infer" \ - --num_passes=10 \ - 2>&1 | tee lambdarank_infer.log diff --git a/ltr/run_ranknet.sh b/ltr/run_ranknet.sh deleted file mode 100644 index 8c574ffd4b0..00000000000 --- a/ltr/run_ranknet.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/sh - -python ranknet.py \ - --run_type="train" \ - --num_passes=10 \ - 2>&1 | tee ranknet_train.log - -python ranknet.py \ - --run_type="infer" \ - --num_passes=10 \ - 2>&1 | tee ranknet_infer.log diff --git a/text_classification/utils.py b/text_classification/utils.py index d14054d3319..c9b0a854501 100644 --- a/text_classification/utils.py +++ b/text_classification/utils.py @@ -9,60 +9,60 @@ def parse_train_cmd(): parser = argparse.ArgumentParser( - description="PaddlePaddle text classification demo") + description="PaddlePaddle text classification example.") parser.add_argument( "--nn_type", type=str, - help="define which type of network to use, available: [dnn, cnn]", + help=("A flag that defines which type of network to use, " + "available: [dnn, cnn]."), default="dnn") parser.add_argument( "--train_data_dir", type=str, required=False, - help=("path of training dataset (default: None). " - "if this parameter is not set, " - "paddle.dataset.imdb will be used."), + help=("The path of training dataset (default: None). If this parameter " + "is not set, paddle.dataset.imdb will be used."), default=None) parser.add_argument( "--test_data_dir", type=str, required=False, - help=("path of testing dataset (default: None). " - "if this parameter is not set, " - "paddle.dataset.imdb will be used."), + help=("The path of testing dataset (default: None). If this parameter " + "is not set, paddle.dataset.imdb will be used."), default=None) parser.add_argument( "--word_dict", type=str, required=False, - help=("path of word dictionary (default: None)." - "if this parameter is not set, paddle.dataset.imdb will be used." - "if this parameter is set, but the file does not exist, " - "word dictionay will be built from " - "the training data automatically."), + help=("The path of word dictionary (default: None). If this parameter " + "is not set, paddle.dataset.imdb will be used. If this parameter " + "is set, but the file does not exist, word dictionay " + "will be built from the training data automatically."), default=None) parser.add_argument( "--label_dict", type=str, required=False, - help=("path of label dictionay (default: None)." - "if this parameter is not set, paddle.dataset.imdb will be used." - "if this parameter is set, but the file does not exist, " - "word dictionay will be built from " - "the training data automatically."), + help=("The path of label dictionay (default: None).If this parameter " + "is not set, paddle.dataset.imdb will be used. If this parameter " + "is set, but the file does not exist, word dictionay " + "will be built from the training data automatically."), default=None) parser.add_argument( "--batch_size", type=int, default=32, - help="the number of training examples in one forward/backward pass") + help="The number of training examples in one forward/backward pass.") parser.add_argument( - "--num_passes", type=int, default=10, help="number of passes to train") + "--num_passes", + type=int, + default=10, + help="The number of passes to train the model.") parser.add_argument( "--model_save_dir", type=str, required=False, - help=("path to save the trained models."), + help=("The path to save the trained models."), default="models") return parser.parse_args()