Add paddle sentiment example

Matthew Honnibal 2016-11-20 03:35:23 +01:00
parent e7eac08819
commit 409a18bd42
3 changed files with 86 additions and 0 deletions

@@ -0,0 +1,31 @@
import struct


def write_parameter(outfile, feats):
    """
    From https://github.com/baidu/Paddle/issues/490

    outfile: output file name (string). Note: it must match the parameter
        file name used in the Paddle config.
    feats: sequence of float32 numpy arrays holding the parameter values.
    """
    version = 0
    value_size = 4  # sizeof(float32)
    ret = b""
    for feat in feats:
        ret += feat.tostring()
    size = len(ret) // value_size  # number of float values
    with open(outfile, 'wb') as fo:
        fo.write(struct.pack('iIQ', version, value_size, size))
        fo.write(ret)
# config=trainer_config.py
# output=./model_output
# paddle train --config=$config \
# --save_dir=$output \
# --job=train \
# --use_gpu=false \
# --trainer_count=4 \
# --num_passes=10 \
# --log_period=20 \
# --dot_period=20 \
# --show_parameter_stats_period=100 \
# --test_all_data_in_one_period=1 \
# 2>&1 | tee 'train.log'
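
The commit doesn't show how write_parameter is called. A minimal usage sketch, assuming spaCy 1.x and the get_vectors helper referenced by the data provider below; the output path 'model_output/_emb' is a placeholder and must match the embedding parameter's file name in the trainer config:

import spacy

nlp = spacy.load('en', entity=False)
vectors = get_vectors(nlp)  # builds the float32 vector table (sketched below)
# Placeholder path: use the embedding parameter's actual file name.
# Iterating a 2D array yields its rows, which write_parameter concatenates.
write_parameter('model_output/_emb', vectors)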

@@ -0,0 +1,36 @@
import numpy
import spacy

from paddle.trainer.PyDataProvider2 import *


def get_features(doc):
    return numpy.asarray(
        [t.rank + 1 for t in doc
         if t.has_vector and not t.is_punct and not t.is_space],
        dtype='int32')


def on_init(settings, lang_name, **kwargs):
    print("Loading spaCy")
    nlp = spacy.load('en', entity=False)
    vectors = get_vectors(nlp)
    settings.input_types = [
        # The text is a sequence of integer values, where each value is
        # a word id. The whole sequence is the sentence whose sentiment
        # we want to predict.
        integer_value(vectors.shape[0], seq_type=SequenceType.SEQUENCE),  # text input
        # The label is 0 (negative) or 1 (positive).
        integer_value(2)
    ]
    settings.nlp = nlp
    settings.vectors = vectors


@provider(init_hook=on_init)
def process(settings, data_dir):
    texts, labels = read_data(data_dir)
    for doc, label in zip(settings.nlp.pipe(texts, batch_size=5000,
                                            n_threads=3),
                          labels):
        for sent in doc.sents:
            ids = get_features(sent)
            # Hand the example to Paddle.
            yield ids, label
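
get_vectors and read_data are referenced above but not shown in this excerpt. A minimal sketch of both, under stated assumptions: get_vectors leaves row 0 as zeros so the rank+1 ids from get_features index the right rows, and read_data assumes a hypothetical pos/neg directory layout:

import os


def get_vectors(nlp):
    # Row 0 is reserved (zeros) for padding/unknown tokens, matching the
    # rank + 1 ids produced by get_features() above.
    max_rank = max(lex.rank for lex in nlp.vocab if lex.has_vector)
    vectors = numpy.zeros((max_rank + 2, nlp.vocab.vectors_length),
                          dtype='float32')
    for lex in nlp.vocab:
        if lex.has_vector:
            vectors[lex.rank + 1] = lex.vector
    return vectors


def read_data(data_dir):
    # Hypothetical layout: data_dir/neg/*.txt and data_dir/pos/*.txt,
    # one document per file.
    texts, labels = [], []
    for label, subdir in enumerate(['neg', 'pos']):
        dirname = os.path.join(data_dir, subdir)
        for filename in os.listdir(dirname):
            with open(os.path.join(dirname, filename)) as file_:
                texts.append(file_.read())
            labels.append(label)
    return texts, labels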

@@ -0,0 +1,19 @@
from paddle.trainer_config_helpers import *

def bidirectional_lstm_net(input_dim,
                           class_dim=2,
                           emb_dim=128,
                           lstm_dim=128,
                           is_predict=False):
    # input_dim is the vocabulary size, i.e. vectors.shape[0] from the
    # data provider above.
    data = data_layer("word", input_dim)
    emb = embedding_layer(input=data, size=emb_dim)
    bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim)
    dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
    output = fc_layer(input=dropout, size=class_dim, act=SoftmaxActivation())
    if not is_predict:
        # Training: attach the label input and optimize the
        # classification cost.
        lbl = data_layer("label", 1)
        outputs(classification_cost(input=output, label=lbl))
    else:
        # Prediction: expose the softmax output directly.
        outputs(output)
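
The network function alone isn't a complete trainer config. A sketch of the usual glue, assuming the data provider above lives in dataprovider.py, that train.list lists the data directories handed to process() as data_dir, and placeholder hyperparameters:

# Assumption: must equal vectors.shape[0] computed by the data provider.
dict_dim = 500000

define_py_data_sources2(train_list='train.list',
                        test_list=None,
                        module='dataprovider',
                        obj='process',
                        args={'lang_name': 'en'})

settings(batch_size=128,
         learning_rate=2e-3,
         learning_method=AdamOptimizer())

bidirectional_lstm_net(dict_dim)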