mirror of https://github.com/explosion/spaCy.git
synced 2025-01-27 09:44:36 +03:00
Merge branch 'master' into organize-language-data
This commit is contained in:
commit 4edf7057ee
.gitignore (vendored): 5 changes
@@ -29,10 +29,7 @@ spacy/orthography/*.cpp
 ext/murmurhash.cpp
 ext/sparsehash.cpp
 
-data/en/pos
-data/en/ner
-data/en/lexemes
-data/en/strings
+/spacy/data/
 
 _build/
 .env/
@@ -1,31 +0,0 @@
-def write_parameter(outfile, feats):
-    """
-    From https://github.com/baidu/Paddle/issues/490
-
-    outfile: Output file name with string type. **Note**, it should be the same as it in the above config.
-    feats: Parameter with float type.
-    """
-    version = 0
-    value_size = 4; # means float type
-    ret = b""
-    for feat in feats:
-        ret += feat.tostring()
-    size = len(ret) / 4
-    fo = open(outfile, 'wb')
-    fo.write(struct.pack('iIQ', version, value_size, size))
-    fo.write(ret)
-
-
-# config=trainer_config.py
-# output=./model_output
-# paddle train --config=$config \
-#     --save_dir=$output \
-#     --job=train \
-#     --use_gpu=false \
-#     --trainer_count=4 \
-#     --num_passes=10 \
-#     --log_period=20 \
-#     --dot_period=20 \
-#     --show_parameter_stats_period=100 \
-#     --test_all_data_in_one_period=1 \
-#     2>&1 | tee 'train.log'
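The deleted helper serialized a Paddle parameter as a small header (int32 version, uint32 value size, uint64 element count, packed as 'iIQ') followed by the raw float bytes. For orientation, a minimal sketch of the inverse operation, assuming that same header layout; read_parameter is not part of this commit:

    import struct

    import numpy


    def read_parameter(infile):
        # Inverse of write_parameter above: unpack the 'iIQ' header,
        # then read the remaining bytes as float32 values.
        with open(infile, 'rb') as f:
            version, value_size, size = struct.unpack(
                'iIQ', f.read(struct.calcsize('iIQ')))
            assert value_size == 4  # float32, matching write_parameter
            return numpy.fromfile(f, dtype='float32', count=size)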
@@ -1,44 +1,14 @@
-from paddle.trainer.PyDataProvider2 import *
-from itertools import izip
+from paddle.trainer_config_helpers import *
 
+define_py_data_sources2(train_list='train.list',
+                        test_list='test.list',
+                        module="dataprovider",
+                        obj="process")
 
-def get_features(doc):
-    return numpy.asarray(
-        [t.rank+1 for t in doc
-         if t.has_vector and not t.is_punct and not t.is_space],
-        dtype='int32')
-
-
-def read_data(data_dir):
-    for subdir, label in (('pos', 1), ('neg', 0)):
-        for filename in (data_dir / subdir).iterdir():
-            with filename.open() as file_:
-                text = file_.read()
-            yield text, label
-
-
-def on_init(settings, lang_name, **kwargs):
-    print("Loading spaCy")
-    nlp = spacy.load('en', entity=False)
-    vectors = get_vectors(nlp)
-    settings.input_types = [
-        # The text is a sequence of integer values, and each value is a word id.
-        # The whole sequence is the sentences that we want to predict its
-        # sentimental.
-        integer_value(vectors.shape[0], seq_type=SequenceType), # text input
-
-        # label positive/negative
-        integer_value(2)
-    ]
-    settings.nlp = nlp
-    settings.vectors = vectors
-
-
-@provider(init_hook=on_init)
-def process(settings, data_dir): # settings is not used currently.
-    texts, labels = read_data(data_dir)
-    for doc, label in izip(nlp.pipe(texts, batch_size=5000, n_threads=3), labels):
-        for sent in doc.sents:
-            ids = get_features(sent)
-            # give data to paddle.
-            yield ids, label
+settings(
+    batch_size=128,
+    learning_rate=2e-3,
+    learning_method=AdamOptimizer(),
+    regularization=L2Regularization(8e-4),
+    gradient_clipping_threshold=25
+)
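(How the pieces connect: define_py_data_sources2 above names module="dataprovider" and obj="process", so at training time Paddle imports the new dataprovider.py shown below and draws minibatches from its @provider-decorated process() generator.)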
examples/paddle/sentiment_bilstm/dataprovider.py (new file): 46 lines added
@@ -0,0 +1,46 @@
+from paddle.trainer.PyDataProvider2 import *
+from itertools import izip
+import spacy
+
+
+def get_features(doc):
+    return numpy.asarray(
+        [t.rank+1 for t in doc
+         if t.has_vector and not t.is_punct and not t.is_space],
+        dtype='int32')
+
+
+def read_data(data_dir):
+    for subdir, label in (('pos', 1), ('neg', 0)):
+        for filename in (data_dir / subdir).iterdir():
+            with filename.open() as file_:
+                text = file_.read()
+            yield text, label
+
+
+def on_init(settings, **kwargs):
+    print("Loading spaCy")
+    nlp = spacy.load('en', entity=False)
+    vectors = get_vectors(nlp)
+    settings.input_types = [
+        # The text is a sequence of integer values, and each value is a word id.
+        # The whole sequence is the sentences that we want to predict its
+        # sentimental.
+        integer_value(vectors.shape[0], seq_type=SequenceType), # text input
+
+        # label positive/negative
+        integer_value(2)
+    ]
+    settings.nlp = nlp
+    settings.vectors = vectors
+    settings['batch_size'] = 32
+
+
+@provider(init_hook=on_init)
+def process(settings, data_dir): # settings is not used currently.
+    texts, labels = read_data(data_dir)
+    for doc, label in izip(nlp.pipe(texts, batch_size=5000, n_threads=3), labels):
+        for sent in doc.sents:
+            ids = get_features(sent)
+            # give data to paddle.
+            yield ids, label
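A few caveats in dataprovider.py as committed: get_vectors() is called but never defined or imported, numpy is used without an import, and process() refers to a global nlp rather than settings.nlp, so the provider would fail at runtime. A minimal sketch of what get_vectors could look like, using the spaCy 1.x lexeme attributes the file already relies on (the helper itself is an assumption, not part of the commit):

    import numpy


    def get_vectors(nlp):
        # Sketch only: embedding table indexed by rank+1, so it lines up
        # with the t.rank+1 ids produced by get_features; row 0 is padding.
        vocab = nlp.vocab
        max_rank = max(lex.rank for lex in vocab if lex.has_vector)
        vectors = numpy.zeros((max_rank + 2, vocab.vectors_length),
                              dtype='float32')
        for lex in vocab:
            if lex.has_vector:
                vectors[lex.rank + 1] = lex.vector
        return vectors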
examples/paddle/sentiment_bilstm/train.sh (new executable file): 14 lines added
@@ -0,0 +1,14 @@
+config=config.py
+output=./model_output
+paddle train --config=$config \
+    --save_dir=$output \
+    --job=train \
+    --use_gpu=false \
+    --trainer_count=4 \
+    --num_passes=10 \
+    --log_period=20 \
+    --dot_period=20 \
+    --show_parameter_stats_period=100 \
+    --test_all_data_in_one_period=1 \
+    --config_args=batch_size=100 \
+    2>&1 | tee 'train.log'
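The --config_args=batch_size=100 flag forwards a key=value pair to the config script, but the settings() block in this commit hard-codes batch_size=128, so the flag has no effect unless the config reads it. In Paddle v1 configs that is conventionally done with get_config_arg; a hedged sketch of such a change (assumed usage, not in this commit):

    # Let --config_args=batch_size=... override the default of 128.
    batch_size = get_config_arg('batch_size', int, 128)

    settings(
        batch_size=batch_size,
        learning_rate=2e-3,
        learning_method=AdamOptimizer(),
        regularization=L2Regularization(8e-4),
        gradient_clipping_threshold=25
    )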
@@ -7,7 +7,7 @@ p
   | runs on #[strong Unix/Linux], #[strong macOS/OS X] and
   | #[strong Windows]. The latest spaCy releases are currently only
   | available as source packages over
-  | #[+a("https://pypi.python.org/pypi/spacy") pip]. Installaton requires a
+  | #[+a("https://pypi.python.org/pypi/spacy") pip]. Installation requires a
   | working build environment. See notes on
   | #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") macOS/OS X]
   | and #[a(href="#source-windows") Windows] for details.
@@ -31,7 +31,8 @@ p
 p
   | I've tried to make sure that the #[code Language.__call__] function
   | doesn't do any "heavy lifting", so that you won't have complicated logic
-  | to replicate if you need to make your own pipeline class. This is all it | does.
+  | to replicate if you need to make your own pipeline class. This is all it
+  | does.
 
 p
   | The #[code .make_doc()] method and #[code .pipeline] attribute make it
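For context, the "this is all it does" passage refers to a pipeline loop along these lines (a paraphrase of the spaCy 1.x Language.__call__, not a verbatim copy of it):

    def __call__(self, text, tag=True, parse=True, entity=True):
        # Make the Doc, then hand it to each enabled pipeline
        # component in order; no other work happens here.
        doc = self.make_doc(text)
        skip = {self.tagger: not tag,
                self.parser: not parse,
                self.entity: not entity}
        for proc in self.pipeline:
            if proc and not skip.get(proc, False):
                proc(doc)
        return doc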