-
Notifications
You must be signed in to change notification settings - Fork 5.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
基于Fluid的多线程文本分类程序 #8267
Comments
多线程程序的书写规则目前还没有文档。直接看上面的程序我看不出来错误,不熟悉这一部分,请 @reyoung 帮忙看下问题,感谢。 上面程序中有一个一个和 lstm_h, c = fluid.layers.dynamic_lstm(input=emb, size=hid_dim, is_reverse=False) |
import paddle.fluid as fluid
import paddle.v2 as paddle
# Sentiment classification on the IMDB dataset with a dynamic LSTM.  The
# forward pass is wrapped in fluid.layers.ParallelDo; the per-place loss and
# accuracy it returns are averaged before being optimized / reported.
# NOTE(review): the pasted source lost all indentation; the block structure
# below is reconstructed from the syntax -- confirm against the original.
word_dict = paddle.dataset.imdb.word_dict()
print('Load Dict Done')

# vocabulary size
dict_dim = len(word_dict)
# embedding dim
emb_dim = 128
# hidden dim
hid_dim = 128
# hidden dim2
hid_dim2 = 96
# class num
class_dim = 2

# word-id sequence input (lod_level=1)
data = fluid.layers.data(
    name="words", shape=[1], dtype="int64", lod_level=1)
# label data
label = fluid.layers.data(name="label", shape=[1], dtype="int64")

places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places)
with pd.do():
    # Inside the block, work on the values returned by read_input rather
    # than on the outer `data` / `label` variables.
    feat_ = pd.read_input(data)
    label_ = pd.read_input(label)
    emb = fluid.layers.embedding(
        input=feat_,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=5.0))
    lstm_h, c = fluid.layers.dynamic_lstm(
        input=emb, size=hid_dim, is_reverse=False)
    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
    fc1 = fluid.layers.fc(input=lstm_max, size=hid_dim2, act='tanh')
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
    # Fix: the original passed the outer `label` here and left `label_`
    # unused, so the loss/accuracy did not use the label read for this block.
    cost = fluid.layers.cross_entropy(input=prediction, label=label_)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label_)
    pd.write_output(avg_cost)
    pd.write_output(acc)

# pd() yields one value per written output; reduce each with mean so a
# single loss / accuracy is optimized and fetched.
avg_cost_on_each_devs, acc_on_each_devs = pd()
avg_cost = fluid.layers.mean(x=avg_cost_on_each_devs)
acc = fluid.layers.mean(x=acc_on_each_devs)

sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)

BATCH_SIZE = 4
train_reader = paddle.batch(
    paddle.dataset.imdb.train(word_dict), batch_size=BATCH_SIZE)
test_reader = paddle.batch(
    paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE)

place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
exe.run(fluid.default_startup_program())

PASS_NUM = 30
for pass_id in xrange(PASS_NUM):
    # `batch` instead of `data`: the original loop variable rebound the
    # `data` input layer defined above.
    for batch in train_reader():
        avg_cost_np, acc_np = exe.run(
            fluid.default_main_program(),
            feed=feeder.feed(batch),
            fetch_list=[avg_cost, acc])
        print("avg loss: {0}, Acc: {1}".format(str(avg_cost_np), str(acc_np)))
# NOTE(review): in the scraped page a thread reply is fused onto the print
# line above and is cut off; translated, it reads: "After calling
# Parallel.Do, you need to use ..."
最终版本:
import paddle.fluid as fluid
import paddle.v2 as paddle
import numpy as np
import sys
def load_vocab(filename):
    """Build a token -> id mapping from a vocabulary file.

    Every line of ``filename`` is treated as one token (surrounding
    whitespace is stripped) and is assigned its zero-based line number as
    id.  If the same token occurs on several lines, the last line number
    wins.

    Args:
        filename: Path of the vocabulary file to read.

    Returns:
        dict mapping each stripped line to its line index.
    """
    vocab = {}
    with open(filename) as f:
        # enumerate replaces the hand-maintained ``wid`` counter.
        for wid, line in enumerate(f):
            vocab[line.strip()] = wid
    return vocab
# Final version of the IMDB sentiment script: the vocabulary is loaded from
# the file given as the first command-line argument, the LSTM forward pass
# runs inside fluid.layers.ParallelDo, and the mean loss / accuracy are
# reported while training on CPU.
# NOTE(review): the pasted source lost all indentation; the block structure
# below is reconstructed from the syntax -- confirm against the original.
word_dict = load_vocab(sys.argv[1])
# Fix: only add "<unk>" when the vocabulary file does not already define it.
# Unconditionally assigning len(word_dict) to an existing "<unk>" entry gave
# it an id equal to dict_dim, i.e. one past the last row of the embedding.
word_dict.setdefault("<unk>", len(word_dict))
print('Load Dict Done')

# vocabulary size
dict_dim = len(word_dict)
# embedding dim
emb_dim = 128
# hidden dim
hid_dim = 128
# hidden dim2
hid_dim2 = 96
# class num
class_dim = 2

# word-id sequence input (lod_level=1)
data = fluid.layers.data(
    name="words", shape=[1], dtype="int64", lod_level=1)
# label data
label = fluid.layers.data(name="label", shape=[1], dtype="int64")

places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places)
with pd.do():
    # Inside the block, work on the values returned by read_input rather
    # than on the outer `data` / `label` variables.
    feat_ = pd.read_input(data)
    label_ = pd.read_input(label)
    emb = fluid.layers.embedding(
        input=feat_,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=5.0))
    lstm_h, c = fluid.layers.dynamic_lstm(
        input=emb, size=hid_dim, is_reverse=False)
    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
    fc1 = fluid.layers.fc(input=lstm_max, size=hid_dim2, act='tanh')
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
    # Fix: the original passed the outer `label` here and left `label_`
    # unused, so the loss/accuracy did not use the label read for this block.
    cost = fluid.layers.cross_entropy(input=prediction, label=label_)
    avg_cost = fluid.layers.mean(x=cost)
    acc = fluid.layers.accuracy(input=prediction, label=label_)
    pd.write_output(avg_cost)
    pd.write_output(acc)

# pd() yields one value per written output; reduce each with mean so a
# single loss / accuracy is optimized and fetched.
avg_cost_on_each_devs, acc_on_each_devs = pd()
avg_cost = fluid.layers.mean(x=avg_cost_on_each_devs)
acc = fluid.layers.mean(x=acc_on_each_devs)

sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)

BATCH_SIZE = 4
# Readers are shuffled through a 25000-sample buffer before batching.
train_reader = paddle.batch(
    paddle.reader.shuffle(
        paddle.dataset.imdb.train(word_dict), buf_size=25000),
    batch_size=BATCH_SIZE)
test_reader = paddle.batch(
    paddle.reader.shuffle(
        paddle.dataset.imdb.test(word_dict), buf_size=25000),
    batch_size=BATCH_SIZE)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
exe.run(fluid.default_startup_program())

PASS_NUM = 30
for pass_id in xrange(PASS_NUM):
    # Per-pass accumulators for the fetched loss and accuracy.
    avg_cost_list, avg_acc_list = [], []
    # `batch` instead of `data`: the original loop variable rebound the
    # `data` input layer defined above.
    for batch in train_reader():
        avg_cost_np, acc_np = exe.run(
            fluid.default_main_program(),
            feed=feeder.feed(batch),
            fetch_list=[avg_cost, acc])
        avg_cost_list.append(avg_cost_np)
        avg_acc_list.append(acc_np)
    # NOTE(review): placed at pass level (one summary per pass); the source
    # indentation was lost, so confirm it was not meant to run per batch.
    print("avg loss: {0}, Acc: {1}".format(str(np.mean(avg_cost_list)), str(np.mean(avg_acc_list))))
#print("avg loss: {0}, Acc: {1}".format(str(avg_cost_np), str(acc_np))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
尝试编写的多线程程序进行文本分类的例子,无法跑通,错误日志的信息难以帮助定位问题,请fluid内核程序相关同学看看。另外,多线程程序的书写规则是否有对应文档支持?
以上例子在docker环境中无法运行,commit id
commit 8dbbc9d
Merge: efc094f 13922fb
Author: Tao Luo luotao02@baidu.com
Date: Mon Feb 26 15:26:56 2018 +0800
提示的问题
The text was updated successfully, but these errors were encountered: