spaCy/examples/paddle/sentiment_bilstm/dataprovider.py

from paddle.trainer.PyDataProvider2 import *
from itertools import izip


def get_features(doc):
    """Convert a sequence of tokens into an int32 array of word ids.

    A token contributes ``t.rank + 1`` to the result when it has a vector and
    is neither punctuation nor whitespace; all other tokens are dropped.

    doc: any iterable of token-like objects exposing ``rank``,
        ``has_vector``, ``is_punct`` and ``is_space`` (a Doc or a Span).

    Returns a 1-D ``numpy`` array with dtype ``int32``; it is empty when no
    token passes the filter.
    """
    # Imported locally: the visible module has no top-level numpy import, so
    # the bare name ``numpy`` would otherwise raise NameError on first call.
    import numpy

    # NOTE(review): the +1 offset presumably keeps id 0 free for
    # padding/unknown words -- confirm against the embedding table layout.
    word_ids = [
        token.rank + 1
        for token in doc
        if token.has_vector and not token.is_punct and not token.is_space
    ]
    return numpy.asarray(word_ids, dtype='int32')


def read_data(data_dir):
    """Yield ``(text, label)`` pairs for every file under ``pos`` and ``neg``.

    Files in ``data_dir/pos`` are yielded first with label 1, followed by
    files in ``data_dir/neg`` with label 0. Within a sub-directory the order
    is whatever ``iterdir()`` returns.

    data_dir: directory containing the ``pos`` and ``neg`` sub-directories,
        given either as a path object or as a plain string.
    """
    # Imported locally so this function does not depend on a module-level
    # import that the visible file does not have.
    from pathlib import Path

    # The ``/`` operator below only works on path objects; coercing here lets
    # callers pass a plain string path as well.
    root = Path(data_dir)
    for subdir, label in (('pos', 1), ('neg', 0)):
        for filename in (root / subdir).iterdir():
            with filename.open() as file_:
                text = file_.read()
            # Yield after the file is closed so a slow (or abandoned)
            # consumer never keeps a file handle open.
            yield text, label


def on_init(settings, lang_name, **kwargs):
    """Init hook for the data provider: load spaCy and declare input types.

    Loads the English pipeline, obtains the vector table, sets
    ``settings.input_types`` to a word-id sequence slot plus a binary label
    slot, and stores the pipeline and vectors on ``settings`` as
    ``settings.nlp`` and ``settings.vectors``.

    settings: provider settings object; attributes are assigned on it.
    lang_name: accepted but not used in this function.
    **kwargs: any further arguments are accepted and ignored.
    """
    # Imported locally: the visible module never imports spacy, so the bare
    # name would raise NameError when the hook runs.
    import spacy

    print("Loading spaCy")
    nlp = spacy.load('en', entity=False)
    # NOTE(review): get_vectors is not defined in the visible part of this
    # file -- confirm it is provided elsewhere before running.
    vectors = get_vectors(nlp)
    settings.input_types = [
        # The text is a sequence of integer word ids; the whole sequence is
        # the sentence whose sentiment we want to predict. The value range
        # is the number of rows in the vector table. seq_type must be a
        # member of SequenceType, not the SequenceType class itself.
        integer_value(vectors.shape[0], seq_type=SequenceType.SEQUENCE),

        # Label: positive (1) / negative (0).
        integer_value(2)
    ]
    settings.nlp = nlp
    settings.vectors = vectors


@provider(init_hook=on_init)
def process(settings, data_dir):
    """Yield one ``(word_ids, label)`` sample per sentence of every document.

    settings: provider settings; ``settings.nlp`` (set by ``on_init``) is the
        spaCy pipeline used to parse the texts.
    data_dir: location handed to ``read_data``.

    Each sentence inherits the label of the document it came from.
    """
    # read_data is a generator of (text, label) pairs, so it cannot be
    # unpacked into two names directly; split it into parallel lists first.
    examples = list(read_data(data_dir))
    texts = [text for text, _ in examples]
    labels = [label for _, label in examples]

    # The pipeline is stored on settings by the init hook; there is no
    # module-level ``nlp`` name to fall back on.
    docs = settings.nlp.pipe(texts, batch_size=5000, n_threads=3)

    # izip keeps the parsed docs streaming instead of materialising them all.
    for doc, label in izip(docs, labels):
        for sent in doc.sents:
            ids = get_features(sent)
            # Give data to paddle.
            yield ids, label
Add paddle sentiment example 2016-11-20 05:35:23 +03:00			`from paddle.trainer.PyDataProvider2 import *`
Update config.py 2016-11-20 05:45:51 +03:00			`from itertools import izip`
Add paddle sentiment example 2016-11-20 05:35:23 +03:00

			`def get_features(doc):`
			`return numpy.asarray(`
			`[t.rank+1 for t in doc`
			`if t.has_vector and not t.is_punct and not t.is_space],`
			`dtype='int32')`


Update config.py 2016-11-20 05:45:51 +03:00			`def read_data(data_dir):`
			`for subdir, label in (('pos', 1), ('neg', 0)):`
			`for filename in (data_dir / subdir).iterdir():`
			`with filename.open() as file_:`
			`text = file_.read()`
			`yield text, label`


Add paddle sentiment example 2016-11-20 05:35:23 +03:00			`def on_init(settings, lang_name, **kwargs):`
			`print("Loading spaCy")`
			`nlp = spacy.load('en', entity=False)`
			`vectors = get_vectors(nlp)`
			`settings.input_types = [`
			`# The text is a sequence of integer values, and each value is a word id.`
			`# The whole sequence is the sentences that we want to predict its`
			`# sentimental.`
			`integer_value(vectors.shape[0], seq_type=SequenceType), # text input`

			`# label positive/negative`
			`integer_value(2)`
			`]`
			`settings.nlp = nlp`
			`settings.vectors = vectors`


			`@provider(init_hook=on_init)`
			`def process(settings, data_dir): # settings is not used currently.`
			`texts, labels = read_data(data_dir)`
Update config.py 2016-11-20 05:45:51 +03:00			`for doc, label in izip(nlp.pipe(texts, batch_size=5000, n_threads=3), labels):`
Add paddle sentiment example 2016-11-20 05:35:23 +03:00			`for sent in doc.sents:`
			`ids = get_features(sent)`
			`# give data to paddle.`
			`yield ids, label`