Merge branch 'master' into organize-language-data

This commit is contained in:
Ines Montani 2016-11-22 19:53:57 +01:00
commit 4edf7057ee
7 changed files with 76 additions and 79 deletions

5
.gitignore vendored
View File

@ -29,10 +29,7 @@ spacy/orthography/*.cpp
ext/murmurhash.cpp
ext/sparsehash.cpp
data/en/pos
data/en/ner
data/en/lexemes
data/en/strings
/spacy/data/
_build/
.env/

View File

@ -1,31 +0,0 @@
def write_parameter(outfile, feats):
    """
    Write parameters in Paddle's binary format.

    From https://github.com/baidu/Paddle/issues/490
    outfile: Output file name with string type. **Note**, it should be the same
        as it in the above config.
    feats: Iterable of float numpy arrays.
    """
    import struct  # local import: the surrounding file never imports struct

    version = 0
    value_size = 4  # means float type (sizeof(float))
    # tostring() was removed in numpy 2.0; tobytes() is the exact replacement.
    ret = b"".join(feat.tobytes() for feat in feats)
    # '//' keeps this an int — struct's 'Q' format rejects the float that
    # '/' produces on Python 3.
    size = len(ret) // 4
    # with-block guarantees the file is flushed and closed (original leaked it).
    with open(outfile, 'wb') as fo:
        fo.write(struct.pack('iIQ', version, value_size, size))
        fo.write(ret)
# config=trainer_config.py
# output=./model_output
# paddle train --config=$config \
# --save_dir=$output \
# --job=train \
# --use_gpu=false \
# --trainer_count=4 \
# --num_passes=10 \
# --log_period=20 \
# --dot_period=20 \
# --show_parameter_stats_period=100 \
# --test_all_data_in_one_period=1 \
# 2>&1 | tee 'train.log'

View File

@ -1,44 +1,14 @@
# Paddle data-provider wiring: point the trainer at the train/test file lists
# and delegate example generation to dataprovider.process().
# NOTE(review): the functions below use numpy and spacy, which are never
# imported in this hunk — confirm the imports exist in the full file.
from paddle.trainer.PyDataProvider2 import *
from itertools import izip  # NOTE(review): izip is Python 2 only; zip on Python 3
from paddle.trainer_config_helpers import *
define_py_data_sources2(train_list='train.list',
test_list='test.list',
module="dataprovider",
obj="process")
def get_features(doc):
    """Return an int32 array of 1-based token rank ids for *doc*, skipping
    tokens that lack a vector or are punctuation/whitespace."""
    ranks = []
    for token in doc:
        if token.has_vector and not token.is_punct and not token.is_space:
            ranks.append(token.rank + 1)
    return numpy.asarray(ranks, dtype='int32')
def read_data(data_dir):
    """Yield (text, label) pairs from data_dir/pos (label 1) then
    data_dir/neg (label 0), one pair per file."""
    labelled_dirs = (('pos', 1), ('neg', 0))
    for subdir, label in labelled_dirs:
        for path in (data_dir / subdir).iterdir():
            with path.open() as file_:
                yield file_.read(), label
def on_init(settings, lang_name, **kwargs):
    """Data-provider init hook: load the spaCy pipeline, declare the input
    schema Paddle should expect from process(), and stash the pipeline and
    vector table on `settings`.

    `lang_name` is accepted but unused here (the pipeline is hard-coded to
    'en').
    """
    print("Loading spaCy")
    pipeline = spacy.load('en', entity=False)
    # NOTE(review): get_vectors() is not defined in this hunk — confirm it is
    # defined or imported elsewhere in the file.
    table = get_vectors(pipeline)
    settings.input_types = [
        # Slot 1: the document as a sequence of integer word ids.
        integer_value(table.shape[0], seq_type=SequenceType),  # text input
        # Slot 2: the sentiment label (0 = negative, 1 = positive).
        integer_value(2),
    ]
    settings.nlp = pipeline
    settings.vectors = table
@provider(init_hook=on_init)
def process(settings, data_dir):
    """Yield one (token-id sequence, label) training example per sentence.

    Fixes vs. the original:
    - `texts, labels = read_data(data_dir)` unpacked the generator itself,
      which only works if it yields exactly two items; collect the
      (text, label) pairs and split them instead.
    - `nlp` was a free variable never defined at module level; on_init()
      stores the loaded pipeline on `settings.nlp`.
    """
    pairs = list(read_data(data_dir))
    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    docs = settings.nlp.pipe(texts, batch_size=5000, n_threads=3)
    for doc, label in izip(docs, labels):
        for sent in doc.sents:
            ids = get_features(sent)
            # Hand the example to Paddle.
            yield ids, label
# Trainer hyper-parameters (Paddle trainer_config_helpers `settings` call).
settings(
batch_size=128,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),  # L2 weight decay
gradient_clipping_threshold=25
)

View File

@ -0,0 +1,46 @@
from itertools import izip

import numpy
import spacy
from paddle.trainer.PyDataProvider2 import *
def get_features(doc):
    """Return int32 ids (rank + 1) for the tokens of *doc* that have a
    vector and are neither punctuation nor whitespace."""
    def keep(t):
        return t.has_vector and not t.is_punct and not t.is_space
    return numpy.asarray([t.rank + 1 for t in doc if keep(t)], dtype='int32')
def read_data(data_dir):
    """Walk data_dir/pos then data_dir/neg, yielding one (text, label) pair
    per file — label 1 for pos, 0 for neg."""
    for subdir in ('pos', 'neg'):
        label = int(subdir == 'pos')
        for path in (data_dir / subdir).iterdir():
            with path.open() as fh:
                yield fh.read(), label
def on_init(settings, **kwargs):
    """Init hook for the Paddle data provider: loads the spaCy English
    pipeline, declares the two input slots, and stores the pipeline and
    vector table on `settings` for process() to use."""
    print("Loading spaCy")
    pipeline = spacy.load('en', entity=False)
    # NOTE(review): get_vectors() is not defined in this hunk — confirm it is
    # defined or imported elsewhere in the file.
    table = get_vectors(pipeline)
    settings.input_types = [
        # Slot 1: the document as a sequence of integer word ids.
        integer_value(table.shape[0], seq_type=SequenceType),  # text input
        # Slot 2: the sentiment label (0 = negative, 1 = positive).
        integer_value(2),
    ]
    settings.nlp = pipeline
    settings.vectors = table
    settings['batch_size'] = 32
@provider(init_hook=on_init)
def process(settings, data_dir):
    """Yield one (token-id sequence, label) training example per sentence.

    Fixes vs. the original:
    - `texts, labels = read_data(data_dir)` unpacked the generator itself,
      which only works if it yields exactly two items; collect the
      (text, label) pairs and split them instead.
    - `nlp` was a free variable never defined at module level; on_init()
      stores the loaded pipeline on `settings.nlp`.
    """
    pairs = list(read_data(data_dir))
    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    docs = settings.nlp.pipe(texts, batch_size=5000, n_threads=3)
    for doc, label in izip(docs, labels):
        for sent in doc.sents:
            ids = get_features(sent)
            # Hand the example to Paddle.
            yield ids, label

View File

@ -0,0 +1,14 @@
# Launch Paddle training for the sentiment model.
# Fix: the original line ended in `tee 'train.log'_` — shell quote
# concatenation makes that the word train.log_, so the log went to the
# wrong file.
config=config.py
output=./model_output
paddle train --config=$config \
    --save_dir=$output \
    --job=train \
    --use_gpu=false \
    --trainer_count=4 \
    --num_passes=10 \
    --log_period=20 \
    --dot_period=20 \
    --show_parameter_stats_period=100 \
    --test_all_data_in_one_period=1 \
    --config_args=batch_size=100 \
    2>&1 | tee 'train.log'

View File

@ -7,7 +7,7 @@ p
| runs on #[strong Unix/Linux], #[strong macOS/OS X] and
| #[strong Windows]. The latest spaCy releases are currently only
| available as source packages over
| #[+a("https://pypi.python.org/pypi/spacy") pip]. Installaton requires a
| #[+a("https://pypi.python.org/pypi/spacy") pip]. Installation requires a
| working build environment. See notes on
| #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") macOS/OS X]
| and #[a(href="#source-windows") Windows] for details.

View File

@ -31,7 +31,8 @@ p
p
| I've tried to make sure that the #[code Language.__call__] function
| doesn't do any "heavy lifting", so that you won't have complicated logic
| to replicate if you need to make your own pipeline class. This is all it | does.
| to replicate if you need to make your own pipeline class. This is all it
| does.
p
| The #[code .make_doc()] method and #[code .pipeline] attribute make it