Merge pull request #485 from will-am/deep_fm

Implement DeepFM for CTR prediction
PaddlePaddle · Nov 27, 2017 · 061585a · 061585a
2 parents d1b3759 + 7b83fa4
commit 061585a
Show file tree

Hide file tree

Showing 7 changed files with 565 additions and 0 deletions.
diff --git a/deep_fm/README.md b/deep_fm/README.md
@@ -0,0 +1,88 @@
+# Deep Factorization Machine for Click-Through Rate prediction
+
+## Introduction
+This model implements the DeepFM proposed in the following paper:
+
+```text
+Huifeng Guo, Ruiming Tang, Yunming Ye, Zhenguo Li and Xiuqiang He. DeepFM:
+A Factorization-Machine based Neural Network for CTR Prediction. Proceedings
+of the Twenty-Sixth International Joint Conference on Artificial Intelligence
+(IJCAI-17), 2017
+```
+
+DeepFM combines a factorization machine with a deep neural network to model
+both low-order and high-order feature interactions. For details on
+factorization machines, please refer to the paper [Factorization
+Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf).
+
+## Dataset
+This example uses the Criteo dataset, which was used for the [Display
+Advertising Challenge](https://www.kaggle.com/c/criteo-display-ad-challenge/)
+hosted by Kaggle.
+
+Each row contains the features for one ad display, and the first column is a
+label indicating whether the ad was clicked. There are 39 features in total:
+13 take integer values and the other 26 are categorical. For the test dataset,
+the labels are omitted.
+
+Download dataset:
+```bash
+cd data && ./download.sh && cd ..
+```
+
+## Model
+The DeepFM model is composed of a factorization machine layer (FM) and a deep
+neural network (DNN). All the input features are fed to both the FM and the
+DNN, and their outputs are combined to form the final output. The embedding
+layer for sparse features in the DNN shares its parameters with the latent
+vectors (factors) of the FM layer.
+
+The factorization machine layer in PaddlePaddle computes the second order
+interactions. The following code example combines the factorization machine
+layer and fully connected layer to form the full version of factorization
+machine:
+
+```python
+def fm_layer(input, factor_size):
+    # First-order (linear) term: a size-1 fully connected layer.
+    first_order = paddle.layer.fc(input=input, size=1, act=paddle.activation.Linear())
+    # Second-order term: the interactions computed by the factorization machine layer.
+    second_order = paddle.layer.factorization_machine(input=input, factor_size=factor_size)
+    # Add the two terms together; no extra bias is attached to the sum.
+    fm = paddle.layer.addto(input=[first_order, second_order],
+                            act=paddle.activation.Linear(),
+                            bias_attr=False)
+    return fm
+
+## Data preparation
+To preprocess the raw dataset, the integer features are clipped and then
+min-max normalized to [0, 1], and the categorical features are one-hot encoded.
+The raw training dataset is split so that 90% is used for training and the
+other 10% is used for validation during training.
+
+```bash
+python preprocess.py --datadir ./data/raw --outdir ./data
+```
+
+## Train
+The command line options for training can be listed by `python train.py -h`.
+
+To train the model:
+```bash
+python train.py \
+        --train_data_path data/train.txt \
+        --test_data_path data/valid.txt \
+        2>&1 | tee train.log
+```
+
+After training pass 9 batch 40000, the testing AUC is `0.807178` and the testing
+cost is `0.445196`.
+
+## Infer
+The command line options for inference can be listed by `python infer.py -h`.
+
+To make inference for the test dataset:
+```bash
+python infer.py \
+        --model_gz_path models/model-pass-9-batch-10000.tar.gz \
+        --data_path data/test.txt \
+        --prediction_output_path ./predict.txt
+```
diff --git a/deep_fm/data/download.sh b/deep_fm/data/download.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+wget --no-check-certificate https://s3-eu-west-1.amazonaws.com/criteo-labs/dac.tar.gz
+tar zxf dac.tar.gz
+rm -f dac.tar.gz
+
+mkdir raw
+mv ./*.txt raw/
diff --git a/deep_fm/infer.py b/deep_fm/infer.py
@@ -0,0 +1,63 @@
+import os
+import gzip
+import argparse
+import itertools
+
+import paddle.v2 as paddle
+
+from network_conf import DeepFM
+import reader
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="PaddlePaddle DeepFM example")
+    parser.add_argument(
+        '--model_gz_path',
+        type=str,
+        required=True,
+        help="The path of model parameters gz file")
+    parser.add_argument(
+        '--data_path',
+        type=str,
+        required=True,
+        help="The path of the dataset to infer")
+    parser.add_argument(
+        '--prediction_output_path',
+        type=str,
+        required=True,
+        help="The path to output the prediction")
+    parser.add_argument(
+        '--factor_size',
+        type=int,
+        default=10,
+        help="The factor size for the factorization machine (default:10)")
+
+    return parser.parse_args()
+
+
+def infer():
+    args = parse_args()
+
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    model = DeepFM(args.factor_size, infer=True)
+
+    parameters = paddle.parameters.Parameters.from_tar(
+        gzip.open(args.model_gz_path, 'r'))
+
+    inferer = paddle.inference.Inference(
+        output_layer=model, parameters=parameters)
+
+    dataset = reader.Dataset()
+
+    infer_reader = paddle.batch(dataset.infer(args.data_path), batch_size=1000)
+
+    with open(args.prediction_output_path, 'w') as out:
+        for id, batch in enumerate(infer_reader()):
+            res = inferer.infer(input=batch)
+            predictions = [x for x in itertools.chain.from_iterable(res)]
+            out.write('\n'.join(map(str, predictions)) + '\n')
+
+
+if __name__ == '__main__':
+    infer()
diff --git a/deep_fm/network_conf.py b/deep_fm/network_conf.py
@@ -0,0 +1,76 @@
+import paddle.v2 as paddle
+
+# Width of the dense input vector (the "dense_input" data layer).
+dense_feature_dim = 13
+# Dimension of the sparse binary input vector and value range of each
+# categorical id input. Presumably the total one-hot size produced by
+# preprocessing -- confirm against preprocess.py.
+sparse_feature_dim = 117568
+
+
+def fm_layer(input, factor_size, fm_param_attr):
+    first_order = paddle.layer.fc(
+        input=input, size=1, act=paddle.activation.Linear())
+    second_order = paddle.layer.factorization_machine(
+        input=input,
+        factor_size=factor_size,
+        act=paddle.activation.Linear(),
+        param_attr=fm_param_attr)
+    out = paddle.layer.addto(
+        input=[first_order, second_order],
+        act=paddle.activation.Linear(),
+        bias_attr=False)
+    return out
+
+
+def DeepFM(factor_size, infer=False):
+    dense_input = paddle.layer.data(
+        name="dense_input",
+        type=paddle.data_type.dense_vector(dense_feature_dim))
+    sparse_input = paddle.layer.data(
+        name="sparse_input",
+        type=paddle.data_type.sparse_binary_vector(sparse_feature_dim))
+    sparse_input_ids = [
+        paddle.layer.data(
+            name="C" + str(i),
+            type=paddle.data_type.integer_value(sparse_feature_dim))
+        for i in range(1, 27)
+    ]
+
+    dense_fm = fm_layer(
+        dense_input,
+        factor_size,
+        fm_param_attr=paddle.attr.Param(name="DenseFeatFactors"))
+    sparse_fm = fm_layer(
+        sparse_input,
+        factor_size,
+        fm_param_attr=paddle.attr.Param(name="SparseFeatFactors"))
+
+    def embedding_layer(input):
+        return paddle.layer.embedding(
+            input=input,
+            size=factor_size,
+            param_attr=paddle.attr.Param(name="SparseFeatFactors"))
+
+    sparse_embed_seq = map(embedding_layer, sparse_input_ids)
+    sparse_embed = paddle.layer.concat(sparse_embed_seq)
+
+    fc1 = paddle.layer.fc(
+        input=[sparse_embed, dense_input],
+        size=400,
+        act=paddle.activation.Relu())
+    fc2 = paddle.layer.fc(input=fc1, size=400, act=paddle.activation.Relu())
+    fc3 = paddle.layer.fc(input=fc2, size=400, act=paddle.activation.Relu())
+
+    predict = paddle.layer.fc(
+        input=[dense_fm, sparse_fm, fc3],
+        size=1,
+        act=paddle.activation.Sigmoid())
+
+    if not infer:
+        label = paddle.layer.data(
+            name="label", type=paddle.data_type.dense_vector(1))
+        cost = paddle.layer.multi_binary_label_cross_entropy_cost(
+            input=predict, label=label)
+        paddle.evaluator.classification_error(
+            name="classification_error", input=predict, label=label)
+        paddle.evaluator.auc(name="auc", input=predict, label=label)
+        return cost
+    else:
+        return predict