2016-11-20 05:35:23 +03:00
|
|
|
from itertools import izip
from itertools import tee
from pathlib import Path

import numpy
from paddle.trainer.PyDataProvider2 import *
|
2016-11-20 05:35:23 +03:00
|
|
|
|
|
|
|
|
|
|
|
def get_features(doc):
    """Convert a sequence of spaCy tokens into an array of integer word ids.

    Tokens without a word vector, punctuation tokens and whitespace tokens
    are dropped. Every remaining token is mapped to ``token.rank + 1``, so
    this function never emits id 0 for a non-negative rank (presumably 0 is
    reserved for padding/unknown words -- confirm against ``get_vectors``).

    Args:
        doc: Any iterable of tokens exposing ``rank``, ``has_vector``,
            ``is_punct`` and ``is_space`` (for example a spaCy ``Doc`` or
            sentence ``Span``).

    Returns:
        A 1-D ``numpy.ndarray`` of dtype ``int32``, in token order. The
        array is empty when no token qualifies.
    """
    return numpy.asarray(
        [token.rank + 1
         for token in doc
         if token.has_vector and not (token.is_punct or token.is_space)],
        dtype='int32')
|
|
|
|
|
|
|
|
|
2016-11-20 05:45:51 +03:00
|
|
|
def read_data(data_dir):
    """Yield ``(text, label)`` pairs for every file under ``pos`` and ``neg``.

    All entries of ``data_dir/pos`` are read first and labelled 1, then all
    entries of ``data_dir/neg`` labelled 0. Within a sub-directory the order
    is whatever ``iterdir()`` returns.

    Args:
        data_dir: Directory containing the ``pos`` and ``neg``
            sub-directories. May be a path object or a plain string.

    Yields:
        Tuples ``(text, label)`` where ``text`` is the decoded file content
        and ``label`` is 1 (positive) or 0 (negative).
    """
    # Accept plain strings as well as path objects: ``str / str`` would
    # raise TypeError on the join below.
    data_dir = Path(data_dir)
    for subdir, label in (('pos', 1), ('neg', 0)):
        for filename in (data_dir / subdir).iterdir():
            # Decode explicitly instead of relying on the locale default,
            # which breaks on non-ASCII reviews on some platforms.
            with filename.open(encoding='utf8') as file_:
                text = file_.read()
            # Yield after the ``with`` so the file is already closed while
            # the generator is suspended.
            yield text, label
|
|
|
|
|
|
|
|
|
2016-11-20 05:35:23 +03:00
|
|
|
def on_init(settings, lang_name, **kwargs):
    """Initialisation hook passed to ``@provider(init_hook=...)``.

    Loads the spaCy pipeline, builds the vector table with ``get_vectors``
    and declares the two input slots on ``settings``. The loaded pipeline
    and the vector table are stored on ``settings`` as ``settings.nlp`` and
    ``settings.vectors``.

    Args:
        settings: Provider settings object; this function assigns its
            ``input_types``, ``nlp`` and ``vectors`` attributes.
        lang_name: Accepted but currently unused -- the ``'en'`` model is
            always loaded.
        **kwargs: Any further hook arguments; ignored.
    """
    # NOTE(review): ``spacy`` and ``get_vectors`` are not defined in the
    # visible part of this file -- confirm they are imported/defined.
    print("Loading spaCy")
    nlp = spacy.load('en', entity=False)
    vectors = get_vectors(nlp)
    settings.input_types = [
        # Text input: a sequence of integer word ids, one per kept token,
        # for the text whose sentiment we want to predict. ``seq_type``
        # must be a SequenceType member, not the SequenceType class itself.
        integer_value(vectors.shape[0], seq_type=SequenceType.SEQUENCE),
        # Label: positive / negative.
        integer_value(2),
    ]
    settings.nlp = nlp
    settings.vectors = vectors
|
|
|
|
|
|
|
|
|
|
|
|
@provider(init_hook=on_init)
def process(settings, data_dir):
    """Yield one ``(word_ids, label)`` sample per sentence of every document.

    Documents are read with ``read_data``, parsed in batches with the spaCy
    pipeline stored on ``settings.nlp``, and split into sentences. Every
    sentence inherits the label of the document it came from.

    Args:
        settings: Provider settings object; ``settings.nlp`` must hold the
            pipeline set up by ``on_init``.
        data_dir: Directory handed on to ``read_data``.

    Yields:
        Tuples ``(ids, label)`` where ``ids`` is the array returned by
        ``get_features`` for one sentence and ``label`` is the document's
        label.
    """
    # read_data yields (text, label) pairs, so it cannot be unpacked into
    # two names directly. Split it into two lazy streams instead: the texts
    # feed nlp.pipe while the labels stay aligned with the parsed docs.
    text_pairs, label_pairs = tee(read_data(data_dir))
    texts = (text for text, _ in text_pairs)
    labels = (label for _, label in label_pairs)
    # The pipeline lives on ``settings``; there is no module-level ``nlp``.
    docs = settings.nlp.pipe(texts, batch_size=5000, n_threads=3)
    for doc, label in izip(docs, labels):
        for sent in doc.sents:
            # give data to paddle.
            yield get_features(sent), label
|