Merge branch 'develop' of https://github.com/PaddlePaddle/book into d…

…evelop
PaddlePaddle · Jan 11, 2017 · 7590464 · 7590464
2 parents c504f69 + da5aac9
commit 7590464
Show file tree

Hide file tree

Showing 15 changed files with 1,320 additions and 2 deletions.
diff --git a/image_classification/README.md b/image_classification/README.md
@@ -96,7 +96,7 @@ NIN模型主要有两个特点：1) 引入了多层感知卷积网络(Multi-Laye
 Inception模块如下图7所示，图(a)是最简单的设计，输出是3个卷积层和一个池化层的特征拼接，这样设计的缺点是池化层不会改变特征通道数，拼接后会导致特征的通道数较大，经过几层这样的模块堆积会导致通道数越来越大，参数和计算量随之增大。为了改善这个缺点，图(b)引入3个1x1卷积层进行降维，所谓的降维就是减少通道数，同时如NIN模型中提到的1x1卷积也可以修正线性特征。
 
 <p align="center">
-<img src="image/inception.png" width="550" ><br/>
+<img src="image/inception.png" width="600" ><br/>
 图7. Inception模块
 </p>
 

diff --git a/understand_sentiment/.gitignore b/understand_sentiment/.gitignore
@@ -0,0 +1,10 @@
+data/aclImdb
+data/imdb
+data/pre-imdb
+data/mosesdecoder-master
+*.log
+model_output
+dataprovider_copy_1.py
+model.list
+*.pyc
+.DS_Store
diff --git a/understand_sentiment/README.md b/understand_sentiment/README.md
diff --git a/understand_sentiment/data/get_imdb.sh b/understand_sentiment/data/get_imdb.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+set -x
+
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd $DIR
+
+#download the dataset
+echo "Downloading aclImdb..."
+#http://ai.stanford.edu/%7Eamaas/data/sentiment/
+wget http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
+
+echo "Downloading mosesdecoder..."
+#https://github.com/moses-smt/mosesdecoder
+wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
+
+#extract package
+echo "Unzipping..."
+tar -zxvf aclImdb_v1.tar.gz
+unzip master.zip
+
+#copy the train and test sets into the imdb directory
+#so that they can be processed during training
+mkdir -p imdb/train
+mkdir -p imdb/test
+
+cp -r aclImdb/train/pos/ imdb/train/pos
+cp -r aclImdb/train/neg/ imdb/train/neg
+
+cp -r aclImdb/test/pos/ imdb/test/pos
+cp -r aclImdb/test/neg/ imdb/test/neg
+
+#remove compressed package
+rm aclImdb_v1.tar.gz
+rm master.zip
+
+echo "Done."
diff --git a/understand_sentiment/dataprovider.py b/understand_sentiment/dataprovider.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer.PyDataProvider2 import *
+
+
+def hook(settings, dictionary, **kwargs):
+    settings.word_dict = dictionary
+    settings.input_types = {
+        'word': integer_value_sequence(len(settings.word_dict)),
+        'label': integer_value(2)
+    }
+    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
+
+
+@provider(init_hook=hook)
+def process(settings, file_name):
+    with open(file_name, 'r') as fdata:
+        for line_count, line in enumerate(fdata):
+            label, comment = line.strip().split('\t\t')
+            label = int(label)
+            words = comment.split()
+            word_slot = [
+                settings.word_dict[w] for w in words if w in settings.word_dict
+            ]
+            yield {'word': word_slot, 'label': label}
diff --git a/understand_sentiment/image/lstm.png b/understand_sentiment/image/lstm.png
diff --git a/understand_sentiment/image/rnn.png b/understand_sentiment/image/rnn.png
diff --git a/understand_sentiment/image/stacked_lstm.jpg b/understand_sentiment/image/stacked_lstm.jpg
diff --git a/understand_sentiment/image/text_cnn.png b/understand_sentiment/image/text_cnn.png
diff --git a/understand_sentiment/predict.py b/understand_sentiment/predict.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os, sys
+import numpy as np
+from optparse import OptionParser
+from py_paddle import swig_paddle, DataProviderConverter
+from paddle.trainer.PyDataProvider2 import integer_value_sequence
+from paddle.trainer.config_parser import parse_config
+"""
+Usage: run following command to show help message.
+  python predict.py -h
+"""
+
+
+class SentimentPrediction():
+    def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
+        """
+        train_conf: trainer configure.
+        dict_file: word dictionary file name.
+        model_dir: directory of model.
+        """
+        self.train_conf = train_conf
+        self.dict_file = dict_file
+        self.word_dict = {}
+        self.dict_dim = self.load_dict()
+        self.model_dir = model_dir
+        if model_dir is None:
+            self.model_dir = os.path.dirname(train_conf)
+
+        self.label = None
+        if label_file is not None:
+            self.load_label(label_file)
+
+        conf = parse_config(train_conf, "is_predict=1")
+        self.network = swig_paddle.GradientMachine.createFromConfigProto(
+            conf.model_config)
+        self.network.loadParameters(self.model_dir)
+        input_types = [integer_value_sequence(self.dict_dim)]
+        self.converter = DataProviderConverter(input_types)
+
+    def load_dict(self):
+        """
+        Load dictionary from self.dict_file.
+        """
+        for line_count, line in enumerate(open(self.dict_file, 'r')):
+            self.word_dict[line.strip().split('\t')[0]] = line_count
+        return len(self.word_dict)
+
+    def load_label(self, label_file):
+        """
+        Load label.
+        """
+        self.label = {}
+        for v in open(label_file, 'r'):
+            self.label[int(v.split('\t')[1])] = v.split('\t')[0]
+
+    def get_index(self, data):
+        """
+        transform word into integer index according to the dictionary.
+        """
+        words = data.strip().split()
+        word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
+        return word_slot
+
+    def batch_predict(self, data_batch):
+        input = self.converter(data_batch)
+        output = self.network.forwardTest(input)
+        prob = output[0]["value"]
+        labs = np.argsort(-prob)
+        for idx, lab in enumerate(labs):
+            if self.label is None:
+                print("predicting label is %d" % (lab[0]))
+            else:
+                print("predicting label is %s" % (self.label[lab[0]]))
+
+
+def option_parser():
+    usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
+    parser = OptionParser(usage="usage: %s [options]" % usage)
+    parser.add_option(
+        "-n",
+        "--tconf",
+        action="store",
+        dest="train_conf",
+        help="network config")
+    parser.add_option(
+        "-d",
+        "--dict",
+        action="store",
+        dest="dict_file",
+        help="dictionary file")
+    parser.add_option(
+        "-b",
+        "--label",
+        action="store",
+        dest="label",
+        default=None,
+        help="dictionary file")
+    parser.add_option(
+        "-c",
+        "--batch_size",
+        type="int",
+        action="store",
+        dest="batch_size",
+        default=1,
+        help="the batch size for prediction")
+    parser.add_option(
+        "-w",
+        "--model",
+        action="store",
+        dest="model_path",
+        default=None,
+        help="model path")
+    return parser.parse_args()
+
+
+def main():
+    options, args = option_parser()
+    train_conf = options.train_conf
+    batch_size = options.batch_size
+    dict_file = options.dict_file
+    model_path = options.model_path
+    label = options.label
+    swig_paddle.initPaddle("--use_gpu=0")
+    predict = SentimentPrediction(train_conf, dict_file, model_path, label)
+
+    batch = []
+    for line in sys.stdin:
+        batch.append([predict.get_index(line)])
+        if len(batch) == batch_size:
+            predict.batch_predict(batch)
+            batch = []
+    if len(batch) > 0:
+        predict.batch_predict(batch)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/understand_sentiment/predict.sh b/understand_sentiment/predict.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+#Note: the default model is pass-00002; you should make sure the model path
+#exists, or change the model path.
+model=model_output/pass-00002/
+config=trainer_config.py
+label=data/pre-imdb/labels.list
+cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
+     --tconf=$config \
+     --model=$model \
+     --label=$label \
+     --dict=./data/pre-imdb/dict.txt \
+     --batch_size=1