Merge branch 'master' into organize-language-data

This commit is contained in:
Ines Montani 2016-11-22 19:53:57 +01:00
commit 4edf7057ee
7 changed files with 76 additions and 79 deletions

5
.gitignore vendored
View File

@ -29,10 +29,7 @@ spacy/orthography/*.cpp
ext/murmurhash.cpp
ext/sparsehash.cpp
data/en/pos
data/en/ner
data/en/lexemes
data/en/strings
/spacy/data/
_build/
.env/

View File

@ -1,31 +0,0 @@
def write_parameter(outfile, feats):
    """
    Write parameters in Paddle's binary format.

    From https://github.com/baidu/Paddle/issues/490
    outfile: Output file name with string type. **Note**, it should be the same
        as it in the above config.
    feats: Iterable of float numpy arrays.
    """
    import struct  # local import: the surrounding file never imports struct

    version = 0
    value_size = 4  # means float type (sizeof(float))
    # tostring() was removed in numpy 2.0; tobytes() is the exact replacement.
    ret = b"".join(feat.tobytes() for feat in feats)
    # '//' keeps this an int — struct's 'Q' format rejects the float that
    # '/' produces on Python 3.
    size = len(ret) // 4
    # with-block guarantees the file is flushed and closed (original leaked it).
    with open(outfile, 'wb') as fo:
        fo.write(struct.pack('iIQ', version, value_size, size))
        fo.write(ret)
# config=trainer_config.py
# output=./model_output
# paddle train --config=$config \
# --save_dir=$output \
# --job=train \
# --use_gpu=false \
# --trainer_count=4 \
# --num_passes=10 \
# --log_period=20 \
# --dot_period=20 \
# --show_parameter_stats_period=100 \
# --test_all_data_in_one_period=1 \
# 2>&1 | tee 'train.log'

View File

@ -1,44 +1,14 @@
# Paddle data-provider wiring: point the trainer at the train/test file lists
# and delegate example generation to dataprovider.process().
# NOTE(review): the functions below use numpy and spacy, which are never
# imported in this hunk — confirm the imports exist in the full file.
from paddle.trainer.PyDataProvider2 import *
from itertools import izip  # NOTE(review): izip is Python 2 only; zip on Python 3
from paddle.trainer_config_helpers import *
define_py_data_sources2(train_list='train.list',
test_list='test.list',
module="dataprovider",
obj="process")
def get_features(doc):
    """Return an int32 array of 1-based token rank ids for *doc*, skipping
    tokens that lack a vector or are punctuation/whitespace."""
    ranks = []
    for token in doc:
        if token.has_vector and not token.is_punct and not token.is_space:
            ranks.append(token.rank + 1)
    return numpy.asarray(ranks, dtype='int32')
def read_data(data_dir):
    """Yield (text, label) pairs from data_dir/pos (label 1) then
    data_dir/neg (label 0), one pair per file."""
    labelled_dirs = (('pos', 1), ('neg', 0))
    for subdir, label in labelled_dirs:
        for path in (data_dir / subdir).iterdir():
            with path.open() as file_:
                yield file_.read(), label
def on_init(settings, lang_name, **kwargs):
    """Data-provider init hook: load the spaCy pipeline, declare the input
    schema Paddle should expect from process(), and stash the pipeline and
    vector table on `settings`.

    `lang_name` is accepted but unused here (the pipeline is hard-coded to
    'en').
    """
    print("Loading spaCy")
    pipeline = spacy.load('en', entity=False)
    # NOTE(review): get_vectors() is not defined in this hunk — confirm it is
    # defined or imported elsewhere in the file.
    table = get_vectors(pipeline)
    settings.input_types = [
        # Slot 1: the document as a sequence of integer word ids.
        integer_value(table.shape[0], seq_type=SequenceType),  # text input
        # Slot 2: the sentiment label (0 = negative, 1 = positive).
        integer_value(2),
    ]
    settings.nlp = pipeline
    settings.vectors = table
@provider(init_hook=on_init)
def process(settings, data_dir):
    """Yield one (token-id sequence, label) training example per sentence.

    Fixes vs. the original:
    - `texts, labels = read_data(data_dir)` unpacked the generator itself,
      which only works if it yields exactly two items; collect the
      (text, label) pairs and split them instead.
    - `nlp` was a free variable never defined at module level; on_init()
      stores the loaded pipeline on `settings.nlp`.
    """
    pairs = list(read_data(data_dir))
    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    docs = settings.nlp.pipe(texts, batch_size=5000, n_threads=3)
    for doc, label in izip(docs, labels):
        for sent in doc.sents:
            ids = get_features(sent)
            # Hand the example to Paddle.
            yield ids, label
# Trainer hyper-parameters (Paddle trainer_config_helpers `settings` call).
settings(
batch_size=128,
learning_rate=2e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(8e-4),  # L2 weight decay
gradient_clipping_threshold=25
)

View File

@ -0,0 +1,46 @@
from itertools import izip

import numpy
import spacy
from paddle.trainer.PyDataProvider2 import *
def get_features(doc):
    """Return int32 ids (rank + 1) for the tokens of *doc* that have a
    vector and are neither punctuation nor whitespace."""
    def keep(t):
        return t.has_vector and not t.is_punct and not t.is_space
    return numpy.asarray([t.rank + 1 for t in doc if keep(t)], dtype='int32')
def read_data(data_dir):
    """Walk data_dir/pos then data_dir/neg, yielding one (text, label) pair
    per file — label 1 for pos, 0 for neg."""
    for subdir in ('pos', 'neg'):
        label = int(subdir == 'pos')
        for path in (data_dir / subdir).iterdir():
            with path.open() as fh:
                yield fh.read(), label
def on_init(settings, **kwargs):
    """Init hook for the Paddle data provider: loads the spaCy English
    pipeline, declares the two input slots, and stores the pipeline and
    vector table on `settings` for process() to use."""
    print("Loading spaCy")
    pipeline = spacy.load('en', entity=False)
    # NOTE(review): get_vectors() is not defined in this hunk — confirm it is
    # defined or imported elsewhere in the file.
    table = get_vectors(pipeline)
    settings.input_types = [
        # Slot 1: the document as a sequence of integer word ids.
        integer_value(table.shape[0], seq_type=SequenceType),  # text input
        # Slot 2: the sentiment label (0 = negative, 1 = positive).
        integer_value(2),
    ]
    settings.nlp = pipeline
    settings.vectors = table
    settings['batch_size'] = 32
@provider(init_hook=on_init)
def process(settings, data_dir):
    """Yield one (token-id sequence, label) training example per sentence.

    Fixes vs. the original:
    - `texts, labels = read_data(data_dir)` unpacked the generator itself,
      which only works if it yields exactly two items; collect the
      (text, label) pairs and split them instead.
    - `nlp` was a free variable never defined at module level; on_init()
      stores the loaded pipeline on `settings.nlp`.
    """
    pairs = list(read_data(data_dir))
    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    docs = settings.nlp.pipe(texts, batch_size=5000, n_threads=3)
    for doc, label in izip(docs, labels):
        for sent in doc.sents:
            ids = get_features(sent)
            # Hand the example to Paddle.
            yield ids, label

View File

@ -0,0 +1,14 @@
# Launch Paddle training for the sentiment model.
# Fix: the original line ended in `tee 'train.log'_` — shell quote
# concatenation makes that the word train.log_, so the log went to the
# wrong file.
config=config.py
output=./model_output
paddle train --config=$config \
    --save_dir=$output \
    --job=train \
    --use_gpu=false \
    --trainer_count=4 \
    --num_passes=10 \
    --log_period=20 \
    --dot_period=20 \
    --show_parameter_stats_period=100 \
    --test_all_data_in_one_period=1 \
    --config_args=batch_size=100 \
    2>&1 | tee 'train.log'

View File

@ -7,7 +7,7 @@ p
| runs on #[strong Unix/Linux], #[strong macOS/OS X] and
| #[strong Windows]. The latest spaCy releases are currently only
| available as source packages over
| #[+a("https://pypi.python.org/pypi/spacy") pip]. Installaton requires a
| #[+a("https://pypi.python.org/pypi/spacy") pip]. Installation requires a
| working build environment. See notes on
| #[a(href="#source-ubuntu") Ubuntu], #[a(href="#source-osx") macOS/OS X]
| and #[a(href="#source-windows") Windows] for details.

View File

@ -31,7 +31,8 @@ p
p
| I've tried to make sure that the #[code Language.__call__] function
| doesn't do any "heavy lifting", so that you won't have complicated logic
| to replicate if you need to make your own pipeline class. This is all it | does.
| to replicate if you need to make your own pipeline class. This is all it
| does.
p
| The #[code .make_doc()] method and #[code .pipeline] attribute make it